diff options
author | David Howells <dhowells@redhat.com> | 2022-03-31 23:55:08 +0100 |
---|---|---|
committer | David Howells <dhowells@redhat.com> | 2022-11-08 16:42:28 +0000 |
commit | a4ea4c47761943d90cd5d1688b3c3c65922ff2b1 (patch) | |
tree | 6b4197ba2e1ec518a512f7aaf9fbda875457794f /net/rxrpc/input.c | |
parent | 5d7edbc9231ec6b60f9c5b7e7980e9a1cd92e6bb (diff) |
rxrpc: Don't use a ring buffer for call Tx queue
Change the way the Tx queueing works to make the following ends easier to
achieve:
(1) The filling of packets, the encryption of packets and the transmission
of packets can be handled in parallel by separate threads, rather than
rxrpc_sendmsg() allocating, filling, encrypting and transmitting each
packet before moving onto the next one.
(2) Get rid of the fixed-size ring which sets a hard limit on the number
of packets that can be retained in the ring. This allows the number
of packets to increase without having to allocate a very large ring or
having variable-sized rings.
[Note: the downside of this is that it's then less efficient to locate
a packet for retransmission as we then have to step through a list and
examine each buffer in the list.]
(3) Allow the filler/encrypter to run ahead of the transmission window.
(4) Make it easier to do zero copy UDP from the packet buffers.
(5) Make it easier to do zero copy from userspace to the packet buffers -
and thence to UDP (only if for unauthenticated connections).
To that end, the following changes are made:
(1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets
to be transmitted in. This allows them to be placed on multiple
queues simultaneously. An sk_buff isn't really necessary as it's
never passed on to lower-level networking code.
(2) Keep the transmissable packets in a linked list on the call struct
rather than in a ring. As a consequence, the annotation buffer isn't
used either; rather a flag is set on the packet to indicate ackedness.
(3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be
transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate
that all packets up to and including the last got hard acked.
(4) Wire headers are now stored in the txbuf rather than being concocted
on the stack and they're stored immediately before the data, thereby
allowing zerocopy of a single span.
(5) Don't bother with instant-resend on transmission failure; rather,
leave it for a timer or an ACK packet to trigger.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Diffstat (limited to 'net/rxrpc/input.c')
-rw-r--r-- | net/rxrpc/input.c | 145 |
1 files changed, 61 insertions, 84 deletions
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 947e7196e2be..b1f7debd4f3e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -32,7 +32,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, bool resend = false; summary->flight_size = - (call->tx_top - call->tx_hard_ack) - summary->nr_acks; + (call->tx_top - call->acks_hard_ack) - summary->nr_acks; if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; @@ -150,8 +150,8 @@ resume_normality: out: cumulative_acks = 0; out_no_clear_ca: - if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1) - cwnd = RXRPC_RXTX_BUFF_SIZE - 1; + if (cwnd >= RXRPC_TX_MAX_WINDOW) + cwnd = RXRPC_TX_MAX_WINDOW; call->cong_cwnd = cwnd; call->cong_cumul_acks = cumulative_acks; trace_rxrpc_congest(call, summary, acked_serial, change); @@ -169,9 +169,8 @@ send_extra_data: /* Send some previously unsent DATA if we have some to advance the ACK * state. */ - if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & - RXRPC_TX_ANNO_LAST || - summary->nr_acks != call->tx_top - call->tx_hard_ack) { + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) || + summary->nr_acks != call->tx_top - call->acks_hard_ack) { call->cong_extra++; wake_up(&call->waitq); } @@ -184,53 +183,40 @@ send_extra_data: static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, struct rxrpc_ack_summary *summary) { - struct sk_buff *skb, *list = NULL; + struct rxrpc_txbuf *txb; bool rot_last = false; - int ix; - u8 annotation; - if (call->acks_lowest_nak == call->tx_hard_ack) { - call->acks_lowest_nak = to; - } else if (before_eq(call->acks_lowest_nak, to)) { - summary->new_low_nack = true; - call->acks_lowest_nak = to; - } - - spin_lock(&call->lock); - - while (before(call->tx_hard_ack, to)) { - call->tx_hard_ack++; - ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; - skb = call->rxtx_buffer[ix]; - annotation = call->rxtx_annotations[ix]; - rxrpc_see_skb(skb, rxrpc_skb_rotated); - call->rxtx_buffer[ix] = NULL; - call->rxtx_annotations[ix] = 0; - skb->next = list; - list = skb; - - if (annotation & RXRPC_TX_ANNO_LAST) { + list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { + if (before_eq(txb->seq, call->acks_hard_ack)) + continue; + if (!test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) + summary->nr_rot_new_acks++; + if (test_bit(RXRPC_TXBUF_LAST, &txb->flags)) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } - if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK) - summary->nr_rot_new_acks++; + if (txb->seq == to) + break; } - spin_unlock(&call->lock); + if (rot_last) + set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); - trace_rxrpc_transmit(call, (rot_last ? - rxrpc_transmit_rotate_last : - rxrpc_transmit_rotate)); - wake_up(&call->waitq); + _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); - while (list) { - skb = list; - list = skb->next; - skb_mark_not_on_list(skb); - rxrpc_free_skb(skb, rxrpc_skb_freed); + if (call->acks_lowest_nak == call->acks_hard_ack) { + call->acks_lowest_nak = to; + } else if (before_eq(call->acks_lowest_nak, to)) { + summary->new_low_nack = true; + call->acks_lowest_nak = to; } + smp_store_release(&call->acks_hard_ack, to); + + trace_rxrpc_txqueue(call, (rot_last ? + rxrpc_txqueue_rotate_last : + rxrpc_txqueue_rotate)); + wake_up(&call->waitq); return rot_last; } @@ -270,9 +256,9 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, write_unlock(&call->state_lock); if (state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); + trace_rxrpc_txqueue(call, rxrpc_txqueue_await_reply); else - trace_rxrpc_transmit(call, rxrpc_transmit_end); + trace_rxrpc_txqueue(call, rxrpc_txqueue_end); _leave(" = ok"); return true; @@ -678,30 +664,21 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, */ static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call) { - rxrpc_seq_t top, bottom, seq; + struct rxrpc_txbuf *txb; + rxrpc_seq_t top, bottom; bool resend = false; - spin_lock_bh(&call->lock); - - bottom = call->tx_hard_ack + 1; - top = call->acks_lost_top; + bottom = READ_ONCE(call->acks_hard_ack) + 1; + top = READ_ONCE(call->acks_lost_top); if (before(bottom, top)) { - for (seq = bottom; before_eq(seq, top); seq++) { - int ix = seq & RXRPC_RXTX_BUFF_MASK; - u8 annotation = call->rxtx_annotations[ix]; - u8 anno_type = annotation & RXRPC_TX_ANNO_MASK; - - if (anno_type != RXRPC_TX_ANNO_UNACK) + list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { + if (test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) continue; - annotation &= ~RXRPC_TX_ANNO_MASK; - annotation |= RXRPC_TX_ANNO_RETRANS; - call->rxtx_annotations[ix] = annotation; + set_bit(RXRPC_TXBUF_RETRANS, &txb->flags); resend = true; } } - spin_unlock_bh(&call->lock); - if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); } @@ -735,8 +712,8 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), rwind, ntohl(ackinfo->jumbo_max)); - if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) - rwind = RXRPC_RXTX_BUFF_SIZE - 1; + if (rwind > RXRPC_TX_MAX_WINDOW) + rwind = RXRPC_TX_MAX_WINDOW; if (call->tx_winsize != rwind) { if (rwind > call->tx_winsize) wake = true; @@ -775,22 +752,24 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, rxrpc_seq_t seq, int nr_acks, struct rxrpc_ack_summary *summary) { - int ix; - u8 annotation, anno_type; - - for (; nr_acks > 0; nr_acks--, seq++) { - ix = seq & RXRPC_RXTX_BUFF_MASK; - annotation = call->rxtx_annotations[ix]; - anno_type = annotation & RXRPC_TX_ANNO_MASK; - annotation &= ~RXRPC_TX_ANNO_MASK; - switch (*acks++) { + struct rxrpc_txbuf *txb; + + list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { + if (before(txb->seq, seq)) + continue; + if (after_eq(txb->seq, seq + nr_acks)) + break; + switch (acks[txb->seq - seq]) { case RXRPC_ACK_TYPE_ACK: summary->nr_acks++; - if (anno_type == RXRPC_TX_ANNO_ACK) + if (test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) continue; + /* A lot of the time the packet is going to + * have been ACK.'d already. + */ + clear_bit(RXRPC_TXBUF_NACKED, &txb->flags); + set_bit(RXRPC_TXBUF_ACKED, &txb->flags); summary->nr_new_acks++; - call->rxtx_annotations[ix] = - RXRPC_TX_ANNO_ACK | annotation; break; case RXRPC_ACK_TYPE_NACK: if (!summary->nr_nacks && @@ -799,13 +778,12 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, summary->new_low_nack = true; } summary->nr_nacks++; - if (anno_type == RXRPC_TX_ANNO_NAK) + if (test_bit(RXRPC_TXBUF_NACKED, &txb->flags)) continue; summary->nr_new_nacks++; - if (anno_type == RXRPC_TX_ANNO_RETRANS) - continue; - call->rxtx_annotations[ix] = - RXRPC_TX_ANNO_NAK | annotation; + clear_bit(RXRPC_TXBUF_ACKED, &txb->flags); + set_bit(RXRPC_TXBUF_NACKED, &txb->flags); + set_bit(RXRPC_TXBUF_RETRANS, &txb->flags); break; default: return rxrpc_proto_abort("SFT", call, 0); @@ -930,7 +908,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (unlikely(buf.ack.reason == RXRPC_ACK_OUT_OF_SEQUENCE) && first_soft_ack == 1 && prev_pkt == 0 && - call->tx_hard_ack == 0 && + call->acks_hard_ack == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); @@ -989,7 +967,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (before(hard_ack, call->tx_hard_ack) || + if (before(hard_ack, call->acks_hard_ack) || after(hard_ack, call->tx_top)) { rxrpc_proto_abort("AKW", call, 0); goto out; @@ -999,7 +977,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (after(hard_ack, call->tx_hard_ack)) { + if (after(hard_ack, call->acks_hard_ack)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, "ETA"); goto out; @@ -1015,8 +993,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) &summary); } - if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & - RXRPC_TX_ANNO_LAST && + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && summary.nr_acks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) rxrpc_propose_ping(call, ack_serial, |