struct rds_message *rm;
unsigned long flags;
unsigned int tmp;
- unsigned int send_quota = send_batch_count;
struct scatterlist *sg;
int ret = 0;
- int was_empty = 0;
LIST_HEAD(to_be_dropped);
+restart:
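+ /*
+ * Don't try to send on a connection that isn't up; this is rechecked
+ * each time we come back through the restart label.
+ */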
+ if (!rds_conn_up(conn))
+ goto out;
+
/*
* sendmsg calls here after having queued its message on the send
* queue. We only have one task feeding the connection at a time. If
* another thread is already feeding the queue then we back off. This
* avoids blocking the caller and trading per-connection data between
* caches per message.
- *
- * The sem holder will issue a retry if they notice that someone queued
- * a message after they stopped walking the send queue but before they
- * dropped the sem.
*/
- if (!mutex_trylock(&conn->c_send_lock)) {
- rds_stats_inc(s_send_sem_contention);
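+ /*
+ * c_send_lock is now a spinlock taken with IRQs disabled, which lets
+ * callers that cannot sleep (such as the transport's tx completion
+ * handler, see below) drive this path directly.
+ */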
+ if (!spin_trylock_irqsave(&conn->c_send_lock, flags)) {
+ rds_stats_inc(s_send_lock_contention);
ret = -ENOMEM;
goto out;
}
/*
* spin trying to push headers and data down the connection until
* the connection doesn't make forward progress.
*/
- while (--send_quota) {
+ while (1) {
rm = conn->c_xmit_rm;
/*
* If we are between sending messages, we can send a pending congestion
* map update.
- *
- * Transports either define a special xmit_cong_map function,
- * or we allocate a cong_map message and treat it just like any
- * other send.
*/
if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
- if (conn->c_trans->xmit_cong_map) {
- unsigned long map_offset = 0;
- unsigned long map_bytes = sizeof(struct rds_header) +
- RDS_CONG_MAP_BYTES;
-
- while (map_bytes) {
- ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
- map_offset);
- if (ret <= 0) {
- /* too far down the rabbithole! */
- mutex_unlock(&conn->c_send_lock);
- rds_conn_error(conn, "Cong map xmit failed\n");
- goto out;
- }
-
- map_offset += ret;
- map_bytes -= ret;
- }
- } else {
- /* send cong update like a normal rm */
- rm = rds_cong_update_alloc(conn);
- if (IS_ERR(rm)) {
- ret = PTR_ERR(rm);
- break;
- }
- rm->data.op_active = 1;
-
- conn->c_xmit_rm = rm;
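+ /* send the cong map update like a normal rm */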
+ rm = rds_cong_update_alloc(conn);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ break;
}
+ rm->data.op_active = 1;
+
+ conn->c_xmit_rm = rm;
}
/*
if (!rm) {
unsigned int len;
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock(&conn->c_lock);
if (!list_empty(&conn->c_send_queue)) {
rm = list_entry(conn->c_send_queue.next,
list_move_tail(&rm->m_conn_item, &conn->c_retrans);
}
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock(&conn->c_lock);
- if (!rm) {
- was_empty = 1;
+ if (!rm)
break;
- }
/* Unfortunately, the way Infiniband deals with
* RDMA to a bad MR key is by moving the entire
*/
if (rm->rdma.op_active &&
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock(&conn->c_lock);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
- spin_unlock_irqrestore(&conn->c_lock, flags);
- rds_message_put(rm);
+ spin_unlock(&conn->c_lock);
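+ /* the message ref is dropped later, once c_send_lock (IRQs off)
+ * has been released; see the to_be_dropped handling below */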
continue;
}
/* The transport either sends the whole rdma or none of it */
if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
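+ /* record which op is being handed to the transport */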
+ rm->m_final_op = &rm->rdma;
ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
if (ret)
break;
}
if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
- ret = conn->c_trans->xmit_atomic(conn, rm);
+ rm->m_final_op = &rm->atomic;
+ ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
if (ret)
break;
conn->c_xmit_atomic_sent = 1;
+
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue */
set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
if (rm->data.op_active && !conn->c_xmit_data_sent) {
+ rm->m_final_op = &rm->data;
ret = conn->c_trans->xmit(conn, rm,
conn->c_xmit_hdr_off,
conn->c_xmit_sg,
}
}
- /* Nuke any messages we decided not to retransmit. */
- if (!list_empty(&to_be_dropped))
- rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
-
if (conn->c_trans->xmit_complete)
conn->c_trans->xmit_complete(conn);
* stop processing the loop when the transport hasn't taken
* responsibility for forward progress.
*/
- mutex_unlock(&conn->c_send_lock);
+ spin_unlock_irqrestore(&conn->c_send_lock, flags);
- if (send_quota == 0 && !was_empty) {
- /* We exhausted the send quota, but there's work left to
- * do. Return and (re-)schedule the send worker.
- */
- ret = -EAGAIN;
+ /* Nuke any messages we decided not to retransmit. */
+ if (!list_empty(&to_be_dropped)) {
+ /* irqs on here, so we can put(), unlike above */
+ list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+ rds_message_put(rm);
+ rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
}
- if (ret == 0 && was_empty) {
+ /*
+ * Other senders will see that we hold c_send_lock and exit. We
+ * need to recheck the send queue and race again for c_send_lock
+ * to make sure messages don't just sit on the send queue.
+ *
+ * If the transport cannot continue (i.e. ret != 0), then it must
+ * call us when more room is available, such as from the tx
+ * completion handler.
+ */
+ if (ret == 0) {
/* A simple bit test would be way faster than taking the
* spin lock */
spin_lock_irqsave(&conn->c_lock, flags);
if (!list_empty(&conn->c_send_queue)) {
- rds_stats_inc(s_send_sem_queue_raced);
- ret = -EAGAIN;
+ rds_stats_inc(s_send_lock_queue_raced);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ goto restart;
}
spin_unlock_irqrestore(&conn->c_lock, flags);
}
struct rds_sock *rs = NULL;
struct rm_atomic_op *ao;
struct rds_notifier *notifier;
+ unsigned long flags;
- spin_lock(&rm->m_rs_lock);
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
ao = &rm->atomic;
if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
ao->op_notifier = NULL;
}
- spin_unlock(&rm->m_rs_lock);
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
if (rs) {
rds_wake_sk_sleep(rs);
{
struct cmsghdr *cmsg;
int size = 0;
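+ /* bit 0: RDMA_ARGS or atomic cmsgs seen, bit 1: RDMA_DEST/RDMA_MAP seen */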
+ int cmsg_groups = 0;
int retval;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
switch (cmsg->cmsg_type) {
case RDS_CMSG_RDMA_ARGS:
+ cmsg_groups |= 1;
retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
if (retval < 0)
return retval;
size += retval;
+
break;
case RDS_CMSG_RDMA_DEST:
case RDS_CMSG_RDMA_MAP:
+ cmsg_groups |= 2;
/* these are valid but do not add any size */
break;
case RDS_CMSG_ATOMIC_CSWP:
case RDS_CMSG_ATOMIC_FADD:
+ cmsg_groups |= 1;
size += sizeof(struct scatterlist);
break;
size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+ /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+ if (cmsg_groups == 3)
+ return -EINVAL;
+
return size;
}
rds_stats_inc(s_send_queued);
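+ /* call the transmit path directly instead of going through rds_send_worker() */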
if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- rds_send_worker(&conn->c_send_w.work);
+ rds_send_xmit(conn);
rds_message_put(rm);
return payload_len;