Merge branch 'splice-net-some-miscellaneous-msg_splice_pages-changes'
David Howells says:
====================
splice, net: Some miscellaneous MSG_SPLICE_PAGES changes
Now that splice_to_socket() has been rewritten and nothing uses the
->sendpage() file op any more [1], some further miscellaneous changes can
be made:
(1) Remove the ->sendpage() file op.
(2) Remove hash_sendpage*() from AF_ALG.
 (3) Make sunrpc send multiple pages in a single sendmsg() call rather than
     calling sendpage() in TCP (or maybe TLS).
(4) Make tcp_bpf_sendpage() a wrapper around tcp_bpf_sendmsg().
 (5) Make AF_KCM use sendmsg() when calling down to TCP and then make it
     send entire fragment lists in a single sendmsg() call.
Link: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=fd5f4d7da29218485153fd8b4c08da7fc130c79f [1]
====================
Link: https://lore.kernel.org/r/20230609100221.2620633-1-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit 7d4e87e973
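All five changes converge on the same underlying pattern: instead of handing the transport one page per kernel_sendpage() call, the caller points msg.msg_iter at an array of bio_vecs and sets MSG_SPLICE_PAGES, asking the protocol to splice the referenced pages into the socket rather than copy them. A minimal sketch of that pattern (illustrative only; the function below is made up for this write-up and is not part of the series):

/* Sketch: splice two pages into a socket with one sendmsg() call.
 * MSG_SPLICE_PAGES makes the protocol take references on the pages
 * instead of copying them, so the caller must leave the pages
 * unmodified until the data has been transmitted.
 */
static int splice_two_pages(struct socket *sock, struct page *p0,
			    struct page *p1, size_t len1)
{
	struct bio_vec bvec[2];
	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, };

	bvec_set_page(&bvec[0], p0, PAGE_SIZE, 0);	/* all of the first page */
	bvec_set_page(&bvec[1], p1, len1, 0);		/* part of the second */
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, 2, PAGE_SIZE + len1);

	return sock_sendmsg(sock, &msg);	/* bytes sent or -errno */
}

The per-file diffs below are instances of this conversion.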
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -161,58 +161,6 @@ unlock_free:
 	goto unlock;
 }
 
-static ssize_t hash_sendpage(struct socket *sock, struct page *page,
-			     int offset, size_t size, int flags)
-{
-	struct sock *sk = sock->sk;
-	struct alg_sock *ask = alg_sk(sk);
-	struct hash_ctx *ctx = ask->private;
-	int err;
-
-	if (flags & MSG_SENDPAGE_NOTLAST)
-		flags |= MSG_MORE;
-
-	lock_sock(sk);
-	sg_init_table(ctx->sgl.sgl, 1);
-	sg_set_page(ctx->sgl.sgl, page, size, offset);
-
-	if (!(flags & MSG_MORE)) {
-		err = hash_alloc_result(sk, ctx);
-		if (err)
-			goto unlock;
-	} else if (!ctx->more)
-		hash_free_result(sk, ctx);
-
-	ahash_request_set_crypt(&ctx->req, ctx->sgl.sgl, ctx->result, size);
-
-	if (!(flags & MSG_MORE)) {
-		if (ctx->more)
-			err = crypto_ahash_finup(&ctx->req);
-		else
-			err = crypto_ahash_digest(&ctx->req);
-	} else {
-		if (!ctx->more) {
-			err = crypto_ahash_init(&ctx->req);
-			err = crypto_wait_req(err, &ctx->wait);
-			if (err)
-				goto unlock;
-		}
-
-		err = crypto_ahash_update(&ctx->req);
-	}
-
-	err = crypto_wait_req(err, &ctx->wait);
-	if (err)
-		goto unlock;
-
-	ctx->more = flags & MSG_MORE;
-
-unlock:
-	release_sock(sk);
-
-	return err ?: size;
-}
-
 static int hash_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			int flags)
 {
@@ -328,7 +276,6 @@ static struct proto_ops algif_hash_ops = {
 
 	.release	=	af_alg_release,
 	.sendmsg	=	hash_sendmsg,
-	.sendpage	=	hash_sendpage,
 	.recvmsg	=	hash_recvmsg,
 	.accept		=	hash_accept,
 };
@@ -380,18 +327,6 @@ static int hash_sendmsg_nokey(struct socket *sock, struct msghdr *msg,
 	return hash_sendmsg(sock, msg, size);
 }
 
-static ssize_t hash_sendpage_nokey(struct socket *sock, struct page *page,
-				   int offset, size_t size, int flags)
-{
-	int err;
-
-	err = hash_check_key(sock);
-	if (err)
-		return err;
-
-	return hash_sendpage(sock, page, offset, size, flags);
-}
-
 static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
 			      size_t ignored, int flags)
 {
@@ -430,7 +365,6 @@ static struct proto_ops algif_hash_ops_nokey = {
 
 	.release	=	af_alg_release,
 	.sendmsg	=	hash_sendmsg_nokey,
-	.sendpage	=	hash_sendpage_nokey,
 	.recvmsg	=	hash_recvmsg_nokey,
 	.accept		=	hash_accept_nokey,
 };
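With hash_sendpage*() gone, AF_ALG hash sockets are driven solely through sendmsg()/send(), which userspace has always been able to use. A minimal userspace sketch (error checking elided; assumes the running kernel exposes sha256 through AF_ALG):

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "hash",
		.salg_name   = "sha256",
	};
	unsigned char digest[32];
	int tfmfd, opfd;

	tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
	bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa));
	opfd = accept(tfmfd, NULL, 0);

	send(opfd, "abc", 3, MSG_MORE);		/* more data to come */
	send(opfd, "def", 3, 0);		/* final chunk */
	read(opfd, digest, sizeof(digest));	/* fetch the SHA-256 digest */

	close(opfd);
	close(tfmfd);
	return 0;
}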
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1790,7 +1790,6 @@ struct file_operations {
 	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
-	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
 	int (*flock) (struct file *, int, struct file_lock *);
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -161,16 +161,15 @@ static inline bool svc_put_not_last(struct svc_serv *serv)
 extern u32 svc_max_payload(const struct svc_rqst *rqstp);
 
 /*
- * RPC Requsts and replies are stored in one or more pages.
+ * RPC Requests and replies are stored in one or more pages.
  * We maintain an array of pages for each server thread.
  * Requests are copied into these pages as they arrive.  Remaining
  * pages are available to write the reply into.
  *
- * Pages are sent using ->sendpage so each server thread needs to
- * allocate more to replace those used in sending.  To help keep track
- * of these pages we have a receive list where all pages initialy live,
- * and a send list where pages are moved to when there are to be part
- * of a reply.
+ * Pages are sent using ->sendmsg with MSG_SPLICE_PAGES so each server thread
+ * needs to allocate more to replace those used in sending.  To help keep track
+ * of these pages we have a receive list where all pages initialy live, and a
+ * send list where pages are moved to when there are to be part of a reply.
  *
  * We use xdr_buf for holding responses as it fits well with NFS
  * read responses (that have a header, and some data pages, and possibly
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -47,9 +47,9 @@ struct kcm_stats {
 
 struct kcm_tx_msg {
 	unsigned int sent;
-	unsigned int fragidx;
 	unsigned int frag_offset;
 	unsigned int msg_flags;
+	bool started_tx;
 	struct sk_buff *frag_skb;
 	struct sk_buff *last_skb;
};
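For reference, the reworked transmit state with editorial glosses; the comments below are inferred from the kcmsock.c changes further down and are not present in the header itself:

struct kcm_tx_msg {
	unsigned int sent;		/* bytes of this message sent so far */
	unsigned int frag_offset;	/* resume offset into frag_skb's frag array */
	unsigned int msg_flags;
	bool started_tx;		/* transmission of this message has begun */
	struct sk_buff *frag_skb;	/* skb (head or frag_list member) being sent */
	struct sk_buff *last_skb;
};

The per-fragment cursor (fragidx) disappears because the whole fragment array is now handed to sendmsg() in one go; progress is tracked as a single byte offset instead.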
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -568,49 +568,18 @@ out_err:
 static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
 			    size_t size, int flags)
 {
-	struct sk_msg tmp, *msg = NULL;
-	int err = 0, copied = 0;
-	struct sk_psock *psock;
-	bool enospc = false;
+	struct bio_vec bvec;
+	struct msghdr msg = {
+		.msg_flags = flags | MSG_SPLICE_PAGES,
+	};
 
-	psock = sk_psock_get(sk);
-	if (unlikely(!psock))
-		return tcp_sendpage(sk, page, offset, size, flags);
+	bvec_set_page(&bvec, page, size, offset);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
 
-	lock_sock(sk);
-	if (psock->cork) {
-		msg = psock->cork;
-	} else {
-		msg = &tmp;
-		sk_msg_init(msg);
-	}
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		msg.msg_flags |= MSG_MORE;
 
-	/* Catch case where ring is full and sendpage is stalled. */
-	if (unlikely(sk_msg_full(msg)))
-		goto out_err;
-
-	sk_msg_page_add(msg, page, size, offset);
-	sk_mem_charge(sk, size);
-	copied = size;
-	if (sk_msg_full(msg))
-		enospc = true;
-	if (psock->cork_bytes) {
-		if (size > psock->cork_bytes)
-			psock->cork_bytes = 0;
-		else
-			psock->cork_bytes -= size;
-		if (psock->cork_bytes && !enospc)
-			goto out_err;
-		/* All cork bytes are accounted, rerun the prog. */
-		psock->eval = __SK_NONE;
-		psock->cork_bytes = 0;
-	}
-
-	err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
-out_err:
-	release_sock(sk);
-	sk_psock_put(sk, psock);
-	return copied ? copied : err;
+	return tcp_bpf_sendmsg(sk, &msg, size);
 }
 
 enum {
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -581,12 +581,10 @@ static void kcm_report_tx_retry(struct kcm_sock *kcm)
  */
 static int kcm_write_msgs(struct kcm_sock *kcm)
 {
+	unsigned int total_sent = 0;
 	struct sock *sk = &kcm->sk;
 	struct kcm_psock *psock;
-	struct sk_buff *skb, *head;
-	struct kcm_tx_msg *txm;
-	unsigned short fragidx, frag_offset;
-	unsigned int sent, total_sent = 0;
+	struct sk_buff *head;
 	int ret = 0;
 
 	kcm->tx_wait_more = false;
@@ -600,72 +598,57 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
 		if (skb_queue_empty(&sk->sk_write_queue))
 			return 0;
 
-		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
-
-	} else if (skb_queue_empty(&sk->sk_write_queue)) {
-		return 0;
+		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false;
 	}
 
-	head = skb_peek(&sk->sk_write_queue);
-	txm = kcm_tx_msg(head);
+retry:
+	while ((head = skb_peek(&sk->sk_write_queue))) {
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
+		struct kcm_tx_msg *txm = kcm_tx_msg(head);
+		struct sk_buff *skb;
+		unsigned int msize;
+		int i;
 
-	if (txm->sent) {
-		/* Send of first skbuff in queue already in progress */
-		if (WARN_ON(!psock)) {
-			ret = -EINVAL;
-			goto out;
+		if (!txm->started_tx) {
+			psock = reserve_psock(kcm);
+			if (!psock)
+				goto out;
+			skb = head;
+			txm->frag_offset = 0;
+			txm->sent = 0;
+			txm->started_tx = true;
+		} else {
+			if (WARN_ON(!psock)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			skb = txm->frag_skb;
 		}
-		sent = txm->sent;
-		frag_offset = txm->frag_offset;
-		fragidx = txm->fragidx;
-		skb = txm->frag_skb;
-
-		goto do_frag;
-	}
-
-try_again:
-	psock = reserve_psock(kcm);
-	if (!psock)
-		goto out;
-
-	do {
-		skb = head;
-		txm = kcm_tx_msg(head);
-		sent = 0;
-
-do_frag_list:
 		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
 			ret = -EINVAL;
 			goto out;
 		}
 
-		for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
-		     fragidx++) {
-			skb_frag_t *frag;
+		msize = 0;
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			msize += skb_shinfo(skb)->frags[i].bv_len;
 
-			frag_offset = 0;
-do_frag:
-			frag = &skb_shinfo(skb)->frags[fragidx];
-			if (WARN_ON(!skb_frag_size(frag))) {
-				ret = -EINVAL;
-				goto out;
-			}
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
+			      skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags,
+			      msize);
+		iov_iter_advance(&msg.msg_iter, txm->frag_offset);
 
-			ret = kernel_sendpage(psock->sk->sk_socket,
-					      skb_frag_page(frag),
-					      skb_frag_off(frag) + frag_offset,
-					      skb_frag_size(frag) - frag_offset,
-					      MSG_DONTWAIT);
+		do {
+			ret = sock_sendmsg(psock->sk->sk_socket, &msg);
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
 					/* Save state to try again when there's
 					 * write space on the socket
 					 */
-					txm->sent = sent;
-					txm->frag_offset = frag_offset;
-					txm->fragidx = fragidx;
 					txm->frag_skb = skb;
 
 					ret = 0;
 					goto out;
 				}
@@ -679,39 +662,36 @@ do_frag:
 					   true);
 			unreserve_psock(kcm);
 
-			txm->sent = 0;
+			txm->started_tx = false;
 			kcm_report_tx_retry(kcm);
 			ret = 0;
 
-			goto try_again;
+			goto retry;
 		}
 
-			sent += ret;
-			frag_offset += ret;
+			txm->sent += ret;
+			txm->frag_offset += ret;
 			KCM_STATS_ADD(psock->stats.tx_bytes, ret);
-			if (frag_offset < skb_frag_size(frag)) {
-				/* Not finished with this frag */
-				goto do_frag;
-			}
-		}
+		} while (msg.msg_iter.count > 0);
 
 		if (skb == head) {
 			if (skb_has_frag_list(skb)) {
-				skb = skb_shinfo(skb)->frag_list;
-				goto do_frag_list;
+				txm->frag_skb = skb_shinfo(skb)->frag_list;
+				txm->frag_offset = 0;
+				continue;
 			}
 		} else if (skb->next) {
-			skb = skb->next;
-			goto do_frag_list;
+			txm->frag_skb = skb->next;
+			txm->frag_offset = 0;
+			continue;
 		}
 
 		/* Successfully sent the whole packet, account for it. */
+		sk->sk_wmem_queued -= txm->sent;
+		total_sent += txm->sent;
 		skb_dequeue(&sk->sk_write_queue);
 		kfree_skb(head);
-		sk->sk_wmem_queued -= sent;
-		total_sent += sent;
 		KCM_STATS_INCR(psock->stats.tx_msgs);
-	} while ((head = skb_peek(&sk->sk_write_queue)));
+	}
 out:
 	if (!head) {
 		/* Done with all queued messages. */
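The heart of the kcm conversion, distilled: skb_frag_t is a struct bio_vec, so an skb's entire fragment array can back a bvec iterator directly, and resuming after -EAGAIN reduces to advancing the iterator by a byte offset. A condensed sketch (illustrative only; variable names follow the code above, error handling elided):

	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
	size_t size = 0;
	int i, ret;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		size += skb_frag_size(&skb_shinfo(skb)->frags[i]);

	/* The frag array is already an array of bio_vecs. */
	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, skb_shinfo(skb)->frags,
		      skb_shinfo(skb)->nr_frags, size);
	iov_iter_advance(&msg.msg_iter, txm->frag_offset);	/* skip sent bytes */

	ret = sock_sendmsg(psock->sk->sk_socket, &msg);
	if (ret > 0)
		txm->frag_offset += ret;	/* a later -EAGAIN resumes here */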
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1203,13 +1203,14 @@ err_noclose:
 static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
 			     int flags)
 {
-	return kernel_sendpage(sock, virt_to_page(vec->iov_base),
-			       offset_in_page(vec->iov_base),
-			       vec->iov_len, flags);
+	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | flags, };
+
+	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, vec->iov_len);
+	return sock_sendmsg(sock, &msg);
 }
 
 /*
- * kernel_sendpage() is used exclusively to reduce the number of
+ * MSG_SPLICE_PAGES is used exclusively to reduce the number of
  * copy operations in this path. Therefore the caller must ensure
  * that the pages backing @xdr are unchanging.
  *
@@ -1249,28 +1250,13 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
 	if (ret != head->iov_len)
 		goto out;
 
-	if (xdr->page_len) {
-		unsigned int offset, len, remaining;
-		struct bio_vec *bvec;
-
-		bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT);
-		offset = offset_in_page(xdr->page_base);
-		remaining = xdr->page_len;
-		while (remaining > 0) {
-			len = min(remaining, bvec->bv_len - offset);
-			ret = kernel_sendpage(sock, bvec->bv_page,
-					      bvec->bv_offset + offset,
-					      len, 0);
-			if (ret < 0)
-				return ret;
-			*sentp += ret;
-			if (ret != len)
-				goto out;
-			remaining -= len;
-			offset = 0;
-			bvec++;
-		}
-	}
+	msg.msg_flags = MSG_SPLICE_PAGES;
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec,
+		      xdr_buf_pagecount(xdr), xdr->page_len);
+	ret = sock_sendmsg(sock, &msg);
+	if (ret < 0)
+		return ret;
+	*sentp += ret;
 
 	if (tail->iov_len) {
 		ret = svc_tcp_send_kvec(sock, tail, 0);