rds_send_drop_to() is called during socket close. If it takes
m_rs_lock without disabling interrupts, then
rds_send_remove_from_sock() can run from the rx completion
handler and thus deadlock.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/send.c | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/rds/send.c b/net/rds/send.c
index 1b37364..104fe03 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -615,7 +615,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
struct rds_message *rm, *tmp;
struct rds_connection *conn;
- unsigned long flags;
+ unsigned long flags, flags2;
LIST_HEAD(list);
int wake = 0;
@@ -651,9 +651,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
list_for_each_entry(rm, &list, m_sock_item) {
/* We do this here rather than in the loop above, so that
* we don't have to nest m_rs_lock under rs->rs_lock */
- spin_lock(&rm->m_rs_lock);
+ spin_lock_irqsave(&rm->m_rs_lock, flags2);
rm->m_rs = NULL;
- spin_unlock(&rm->m_rs_lock);
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
/*
* If we see this flag cleared then we're *sure* that someone
--
1.5.6.3
--
There were some lingering instances of _iw_ variable names left over from
when the listen code was centralized into rdma_transport.c. Rename them
to _rdma_.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/rdma_transport.c | 12 ++++++------
1 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 7b19024..7d0f901 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -34,7 +34,7 @@
#include "rdma_transport.h"
-static struct rdma_cm_id *rds_iw_listen_id;
+static struct rdma_cm_id *rds_rdma_listen_id;
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
@@ -161,7 +161,7 @@ static int __init rds_rdma_listen_init(void)
rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
- rds_iw_listen_id = cm_id;
+ rds_rdma_listen_id = cm_id;
cm_id = NULL;
out:
if (cm_id)
@@ -171,10 +171,10 @@ out:
static void rds_rdma_listen_stop(void)
{
- if (rds_iw_listen_id) {
- rdsdebug("cm %p\n", rds_iw_listen_id);
- rdma_destroy_id(rds_iw_listen_id);
- rds_iw_listen_id = NULL;
+ if (rds_rdma_listen_id) {
+ rdsdebug("cm %p\n", rds_rdma_listen_id);
+ rdma_destroy_id(rds_rdma_listen_id);
+ rds_rdma_listen_id = NULL;
}
}
--
1.5.6.3
--
The first message to a remote node should prompt a new connection,
even if it is an RDMA op sent via CMSG. Therefore move CMSG parsing to
after connection establishment.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/send.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/rds/send.c b/net/rds/send.c
index 104fe03..a4a7f42 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -854,11 +854,6 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
rm->m_daddr = daddr;
- /* Parse any control messages the user may have included. */
- ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
- if (ret)
- goto out;
-
/* rds_conn_create has a spinlock that runs with IRQ off.
* Caching the conn in the socket helps a lot. */
if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
@@ -874,6 +869,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
rs->rs_conn = conn;
}
+ /* Parse any control messages the user may have included. */
+ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+ if (ret)
+ goto out;
+
if ((rm->m_rdma_cookie || rm->m_rdma_op)
&& conn->c_trans->xmit_rdma == NULL) {
if (printk_ratelimit())
--
1.5.6.3
--
From: Steve Wise <swise@opengridcomputing.com>
The RDS_LL_SEND_FULL bit should be set when we stop transmitting due to
flow control. Otherwise the send worker will keep trying as opposed to
sleeping until we unthrottle. Saves CPU.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/ib_send.c | 2 +-
net/rds/iw_send.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index cb6c52c..fa684b7 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -506,7 +506,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
flow_controlled++;
}
if (work_alloc == 0) {
- rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
rds_ib_stats_inc(s_ib_tx_throttle);
ret = -ENOMEM;
goto out;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 22dd38f..626290b 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -549,7 +549,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
flow_controlled++;
}
if (work_alloc == 0) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
rds_iw_stats_inc(s_iw_tx_throttle);
ret = -ENOMEM;
goto out;
--
1.5.6.3
--
Putting the constant first is a supposed "best practice" that actually makes
the code harder to read.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/rdma.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index eaeeb91..584eac3 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -155,7 +155,7 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
nr_pages, write, 0, pages, NULL);
up_read(&current->mm->mmap_sem);
- if (0 <= ret && (unsigned) ret < nr_pages) {
+ if (ret > 0 && (unsigned) ret < nr_pages) {
while (ret--)
put_page(pages[ret]);
ret = -EFAULT;
--
1.5.6.3
--
> - if (0 <= ret && (unsigned) ret < nr_pages) {
> + if (ret > 0 && (unsigned) ret < nr_pages) {
This is not an equivalent transformation -- the original code is true if
ret == 0, while the new code is false.
Also it seems you don't need the unsigned cast here, since the clause
before just checked that ret is positive?
- R.
--
True, but I'd bet the compiler will warn if we remove it. I'll try it tomorrow and see. Thanks! -- Regards -- Andy --
From: Andrew Grover <andy.grover@gmail.com> Andy, also please resubmit only the real honest-to-goodness bug fixes in this patch series. I don't want to see cleanups, or optimizations like the transformation over to using get_user_pages_fast(). You could have sent that kind of stuff to me weeks ago. Thanks. --
Use the new get_user_pages_fast() function, which is simpler and faster.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/info.c | 5 +----
net/rds/rdma.c | 5 +----
2 files changed, 2 insertions(+), 8 deletions(-)
diff --git a/net/rds/info.c b/net/rds/info.c
index 1d88553..62aeef3 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -188,10 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
ret = -ENOMEM;
goto out;
}
- down_read(&current->mm->mmap_sem);
- ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
- pages, NULL);
- up_read(&current->mm->mmap_sem);
+ ret = get_user_pages_fast(start, nr_pages, 1, pages);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 584eac3..6ecea09 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -150,10 +150,7 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
{
int ret;
- down_read(&current->mm->mmap_sem);
- ret = get_user_pages(current, current->mm, user_addr,
- nr_pages, write, 0, pages, NULL);
- up_read(&current->mm->mmap_sem);
+ ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
if (ret > 0 && (unsigned) ret < nr_pages) {
while (ret--)
--
1.5.6.3
--
From: Steve Wise <swise@opengridcomputing.com>
Fix hack that restricts the credit advertisement to 127.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/ib.h | 2 +-
net/rds/ib_recv.c | 2 +-
net/rds/ib_send.c | 8 ++++----
net/rds/iw.h | 2 +-
net/rds/iw_recv.c | 2 +-
net/rds/iw_send.c | 8 ++++----
net/rds/rds.h | 2 +-
7 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 8be563a..7ff9ea0 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -320,7 +320,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
- u32 *adv_credits, int need_posted);
+ u32 *adv_credits, int need_posted, int max_posted);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 5061b55..71b032b 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -491,7 +491,7 @@ void rds_ib_attempt_ack(struct rds_ib_connection *ic)
}
/* Can we get a send credit? */
- if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
+ if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
rds_ib_stats_inc(s_ib_tx_throttle);
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
return;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index fa684b7..23bf830 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -311,7 +311,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
* and using atomic_cmpxchg when updating the two counters.
*/
int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
- u32 wanted, u32 *adv_credits, int need_posted)
+ ...This fixes a bug where a connection was unexpectedly
not on *any* list while being destroyed. It also
cleans up some code duplication and regularizes some
function names.
* Grab appropriate lock in conn_free() and explain in comment
* Ensure via locking that a conn is always on either
a dev's list or the nodev list
* Add rds_xx_remove_conn() to match rds_xx_add_conn()
* Make rds_xx_add_conn() return void
* Rename remove_{,nodev_}conns() to
destroy_{,nodev_}conns() and unify their implementation
in a helper function
* Document lock ordering as nodev conn_lock before
dev_conn_lock
Reported-by: Yosef Etigin <yosefe@voltaire.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/ib.c | 5 +++--
net/rds/ib.h | 14 +++++++++++---
net/rds/ib_cm.c | 34 +++++++++++++++++++---------------
net/rds/ib_rdma.c | 43 +++++++++++++++++++++----------------------
net/rds/iw.c | 5 +++--
net/rds/iw.h | 14 +++++++++++---
net/rds/iw_cm.c | 35 +++++++++++++++++++----------------
net/rds/iw_rdma.c | 44 ++++++++++++++++++++++----------------------
8 files changed, 109 insertions(+), 85 deletions(-)
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 06a7b79..4933b38 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -51,6 +51,7 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
struct list_head rds_ib_devices;
+/* NOTE: if also grabbing ibdev lock, grab this first */
DEFINE_SPINLOCK(ib_nodev_conns_lock);
LIST_HEAD(ib_nodev_conns);
@@ -137,7 +138,7 @@ void rds_ib_remove_one(struct ib_device *device)
kfree(i_ipaddr);
}
- rds_ib_remove_conns(rds_ibdev);
+ rds_ib_destroy_conns(rds_ibdev);
if (rds_ibdev->mr_pool)
rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
@@ -249,7 +250,7 @@ static int rds_ib_laddr_check(__be32 addr)
void rds_ib_exit(void)
{
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
- rds_ib_remove_nodev_conns();
+ rds_ib_destroy_nodev_conns();
...From: Steve Wise <swise@opengridcomputing.com>
Currently the recv ring low water mark is 1/4 the depth. Performance
measurements show that this limits iWARP throughput by flow controlling
the rds-stress senders. Setting it to 1/2 seems to max the T3
performance. I tried even higher levels but that didn't help and it
started to increase the rds thread cpu utilization.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
---
net/rds/ib_ring.c | 2 +-
net/rds/iw_ring.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
index 99a6cca..ff97e8e 100644
--- a/net/rds/ib_ring.c
+++ b/net/rds/ib_ring.c
@@ -137,7 +137,7 @@ int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
int rds_ib_ring_low(struct rds_ib_work_ring *ring)
{
- return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
+ return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1);
}
/*
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
index d422d4b..da8e3b6 100644
--- a/net/rds/iw_ring.c
+++ b/net/rds/iw_ring.c
@@ -137,7 +137,7 @@ int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
int rds_iw_ring_low(struct rds_iw_work_ring *ring)
{
- return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
+ return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
}
--
1.5.6.3
--
