summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 16:43:10 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 16:43:10 -0700
commit3a8a670eeeaa40d87bd38a587438952741980c18 (patch)
treed5546d311271503eadf75b45d87e12720e72899f /fs
parent6a8cbd9253abc1bd0df4d60c4c24fa555190376d (diff)
parentae230642190a51b85656d6da2df744d534d59544 (diff)
Merge tag 'net-next-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking changes from Jakub Kicinski: "WiFi 7 and sendpage changes are the biggest pieces of work for this release. The latter will definitely require fixes but I think that we got it to a reasonable point. Core: - Rework the sendpage & splice implementations Instead of feeding data into sockets page by page extend sendmsg handlers to support taking a reference on the data, controlled by a new flag called MSG_SPLICE_PAGES Rework the handling of unexpected-end-of-file to invoke an additional callback instead of trying to predict what the right combination of MORE/NOTLAST flags is Remove the MSG_SENDPAGE_NOTLAST flag completely - Implement SCM_PIDFD, a new type of CMSG type analogous to SCM_CREDENTIALS, but it contains pidfd instead of plain pid - Enable socket busy polling with CONFIG_RT - Improve reliability and efficiency of reporting for ref_tracker - Auto-generate a user space C library for various Netlink families Protocols: - Allow TCP to shrink the advertised window when necessary, prevent sk_rcvbuf auto-tuning from growing the window all the way up to tcp_rmem[2] - Use per-VMA locking for "page-flipping" TCP receive zerocopy - Prepare TCP for device-to-device data transfers, by making sure that payloads are always attached to skbs as page frags - Make the backoff time for the first N TCP SYN retransmissions linear. Exponential backoff is unnecessarily conservative - Create a new MPTCP getsockopt to retrieve all info (MPTCP_FULL_INFO) - Avoid waking up applications using TLS sockets until we have a full record - Allow using kernel memory for protocol ioctl callbacks, paving the way to issuing ioctls over io_uring - Add nolocalbypass option to VxLAN, forcing packets to be fully encapsulated even if they are destined for a local IP address - Make TCPv4 use consistent hash in TIME_WAIT and SYN_RECV. Ensure in-kernel ECMP implementation (e.g. Open vSwitch) select the same link for all packets. Support L4 symmetric hashing in Open vSwitch - PPPoE: make number of hash bits configurable - Allow DNS to be overwritten by DHCPACK in the in-kernel DHCP client (ipconfig) - Add layer 2 miss indication and filtering, allowing higher layers (e.g. ACL filters) to make forwarding decisions based on whether packet matched forwarding state in lower devices (bridge) - Support matching on Connectivity Fault Management (CFM) packets - Hide the "link becomes ready" IPv6 messages by demoting their printk level to debug - HSR: don't enable promiscuous mode if device offloads the proto - Support active scanning in IEEE 802.15.4 - Continue work on Multi-Link Operation for WiFi 7 BPF: - Add precision propagation for subprogs and callbacks. This allows maintaining verification efficiency when subprograms are used, or in fact passing the verifier at all for complex programs, especially those using open-coded iterators - Improve BPF's {g,s}setsockopt() length handling. Previously BPF assumed the length is always equal to the amount of written data. But some protos allow passing a NULL buffer to discover what the output buffer *should* be, without writing anything - Accept dynptr memory as memory arguments passed to helpers - Add routing table ID to bpf_fib_lookup BPF helper - Support O_PATH FDs in BPF_OBJ_PIN and BPF_OBJ_GET commands - Drop bpf_capable() check in BPF_MAP_FREEZE command (used to mark maps as read-only) - Show target_{obj,btf}_id in tracing link fdinfo - Addition of several new kfuncs (most of the names are self-explanatory): - Add a set of new dynptr kfuncs: bpf_dynptr_adjust(), bpf_dynptr_is_null(), bpf_dynptr_is_rdonly(), bpf_dynptr_size() and bpf_dynptr_clone(). - bpf_task_under_cgroup() - bpf_sock_destroy() - force closing sockets - bpf_cpumask_first_and(), rework bpf_cpumask_any*() kfuncs Netfilter: - Relax set/map validation checks in nf_tables. Allow checking presence of an entry in a map without using the value - Increase ip_vs_conn_tab_bits range for 64BIT builds - Allow updating size of a set - Improve NAT tuple selection when connection is closing Driver API: - Integrate netdev with LED subsystem, to allow configuring HW "offloaded" blinking of LEDs based on link state and activity (i.e. packets coming in and out) - Support configuring rate selection pins of SFP modules - Factor Clause 73 auto-negotiation code out of the drivers, provide common helper routines - Add more fool-proof helpers for managing lifetime of MDIO devices associated with the PCS layer - Allow drivers to report advanced statistics related to Time Aware scheduler offload (taprio) - Allow opting out of VF statistics in link dump, to allow more VFs to fit into the message - Split devlink instance and devlink port operations New hardware / drivers: - Ethernet: - Synopsys EMAC4 IP support (stmmac) - Marvell 88E6361 8 port (5x1GE + 3x2.5GE) switches - Marvell 88E6250 7 port switches - Microchip LAN8650/1 Rev.B0 PHYs - MediaTek MT7981/MT7988 built-in 1GE PHY driver - WiFi: - Realtek RTL8192FU, 2.4 GHz, b/g/n mode, 2T2R, 300 Mbps - Realtek RTL8723DS (SDIO variant) - Realtek RTL8851BE - CAN: - Fintek F81604 Drivers: - Ethernet NICs: - Intel (100G, ice): - support dynamic interrupt allocation - use meta data match instead of VF MAC addr on slow-path - nVidia/Mellanox: - extend link aggregation to handle 4, rather than just 2 ports - spawn sub-functions without any features by default - OcteonTX2: - support HTB (Tx scheduling/QoS) offload - make RSS hash generation configurable - support selecting Rx queue using TC filters - Wangxun (ngbe/txgbe): - add basic Tx/Rx packet offloads - add phylink support (SFP/PCS control) - Freescale/NXP (enetc): - report TAPRIO packet statistics - Solarflare/AMD: - support matching on IP ToS and UDP source port of outer header - VxLAN and GENEVE tunnel encapsulation over IPv4 or IPv6 - add devlink dev info support for EF10 - Virtual NICs: - Microsoft vNIC: - size the Rx indirection table based on requested configuration - support VLAN tagging - Amazon vNIC: - try to reuse Rx buffers if not fully consumed, useful for ARM servers running with 16kB pages - Google vNIC: - support TCP segmentation of >64kB frames - Ethernet embedded switches: - Marvell (mv88e6xxx): - enable USXGMII (88E6191X) - Microchip: - lan966x: add support for Egress Stage 0 ACL engine - lan966x: support mapping packet priority to internal switch priority (based on PCP or DSCP) - Ethernet PHYs: - Broadcom PHYs: - support for Wake-on-LAN for BCM54210E/B50212E - report LPI counter - Microsemi PHYs: support RGMII delay configuration (VSC85xx) - Micrel PHYs: receive timestamp in the frame (LAN8841) - Realtek PHYs: support optional external PHY clock - Altera TSE PCS: merge the driver into Lynx PCS which it is a variant of - CAN: Kvaser PCIEcan: - support packet timestamping - WiFi: - Intel (iwlwifi): - major update for new firmware and Multi-Link Operation (MLO) - configuration rework to drop test devices and split the different families - support for segmented PNVM images and power tables - new vendor entries for PPAG (platform antenna gain) feature - Qualcomm 802.11ax (ath11k): - Multiple Basic Service Set Identifier (MBSSID) and Enhanced MBSSID Advertisement (EMA) support in AP mode - support factory test mode - RealTek (rtw89): - add RSSI based antenna diversity - support U-NII-4 channels on 5 GHz band - RealTek (rtl8xxxu): - AP mode support for 8188f - support USB RX aggregation for the newer chips" * tag 'net-next-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1602 commits) net: scm: introduce and use scm_recv_unix helper af_unix: Skip SCM_PIDFD if scm->pid is NULL. net: lan743x: Simplify comparison netlink: Add __sock_i_ino() for __netlink_diag_dump(). net: dsa: avoid suspicious RCU usage for synced VLAN-aware MAC addresses Revert "af_unix: Call scm_recv() only after scm_set_cred()." phylink: ReST-ify the phylink_pcs_neg_mode() kdoc libceph: Partially revert changes to support MSG_SPLICE_PAGES net: phy: mscc: fix packet loss due to RGMII delays net: mana: use vmalloc_array and vcalloc net: enetc: use vmalloc_array and vcalloc ionic: use vmalloc_array and vcalloc pds_core: use vmalloc_array and vcalloc gve: use vmalloc_array and vcalloc octeon_ep: use vmalloc_array and vcalloc net: usb: qmi_wwan: add u-blox 0x1312 composition perf trace: fix MSG_SPLICE_PAGES build error ipvlan: Fix return value of ipvlan_queue_xmit() netfilter: nf_tables: fix underflow in chain reference counter netfilter: nf_tables: unbind non-anonymous set if rule construction fails ...
Diffstat (limited to 'fs')
-rw-r--r--fs/dlm/lowcomms.c10
-rw-r--r--fs/netfs/iterator.c266
-rw-r--r--fs/nfsd/vfs.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c38
-rw-r--r--fs/smb/client/smb2ops.c4
-rw-r--r--fs/smb/client/smbdirect.c2
-rw-r--r--fs/splice.c205
7 files changed, 197 insertions, 330 deletions
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3d3802c47b8b..5c12d8cdfc16 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1395,8 +1395,11 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
/* Send a message */
static int send_to_sock(struct connection *con)
{
- const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
+ struct bio_vec bvec;
+ struct msghdr msg = {
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | MSG_NOSIGNAL,
+ };
int len, offset, ret;
spin_lock_bh(&con->writequeue_lock);
@@ -1412,8 +1415,9 @@ static int send_to_sock(struct connection *con)
WARN_ON_ONCE(len == 0 && e->users == 0);
spin_unlock_bh(&con->writequeue_lock);
- ret = kernel_sendpage(con->sock, e->page, offset, len,
- msg_flags);
+ bvec_set_page(&bvec, e->page, len, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+ ret = sock_sendmsg(con->sock, &msg);
trace_dlm_send(con->nodeid, ret);
if (ret == -EAGAIN || ret == 0) {
lock_sock(con->sock->sk);
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 8a4c86687429..2ff07ba655a0 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -101,269 +101,3 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
return npages;
}
EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
-
-/*
- * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class
- * iterators, and add them to the scatterlist.
- */
-static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter,
- ssize_t maxsize,
- struct sg_table *sgtable,
- unsigned int sg_max,
- iov_iter_extraction_t extraction_flags)
-{
- struct scatterlist *sg = sgtable->sgl + sgtable->nents;
- struct page **pages;
- unsigned int npages;
- ssize_t ret = 0, res;
- size_t len, off;
-
- /* We decant the page list into the tail of the scatterlist */
- pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist));
- pages -= sg_max;
-
- do {
- res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max,
- extraction_flags, &off);
- if (res < 0)
- goto failed;
-
- len = res;
- maxsize -= len;
- ret += len;
- npages = DIV_ROUND_UP(off + len, PAGE_SIZE);
- sg_max -= npages;
-
- for (; npages > 0; npages--) {
- struct page *page = *pages;
- size_t seg = min_t(size_t, PAGE_SIZE - off, len);
-
- *pages++ = NULL;
- sg_set_page(sg, page, seg, off);
- sgtable->nents++;
- sg++;
- len -= seg;
- off = 0;
- }
- } while (maxsize > 0 && sg_max > 0);
-
- return ret;
-
-failed:
- while (sgtable->nents > sgtable->orig_nents)
- put_page(sg_page(&sgtable->sgl[--sgtable->nents]));
- return res;
-}
-
-/*
- * Extract up to sg_max pages from a BVEC-type iterator and add them to the
- * scatterlist. The pages are not pinned.
- */
-static ssize_t netfs_extract_bvec_to_sg(struct iov_iter *iter,
- ssize_t maxsize,
- struct sg_table *sgtable,
- unsigned int sg_max,
- iov_iter_extraction_t extraction_flags)
-{
- const struct bio_vec *bv = iter->bvec;
- struct scatterlist *sg = sgtable->sgl + sgtable->nents;
- unsigned long start = iter->iov_offset;
- unsigned int i;
- ssize_t ret = 0;
-
- for (i = 0; i < iter->nr_segs; i++) {
- size_t off, len;
-
- len = bv[i].bv_len;
- if (start >= len) {
- start -= len;
- continue;
- }
-
- len = min_t(size_t, maxsize, len - start);
- off = bv[i].bv_offset + start;
-
- sg_set_page(sg, bv[i].bv_page, len, off);
- sgtable->nents++;
- sg++;
- sg_max--;
-
- ret += len;
- maxsize -= len;
- if (maxsize <= 0 || sg_max == 0)
- break;
- start = 0;
- }
-
- if (ret > 0)
- iov_iter_advance(iter, ret);
- return ret;
-}
-
-/*
- * Extract up to sg_max pages from a KVEC-type iterator and add them to the
- * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or
- * static buffers. The pages are not pinned.
- */
-static ssize_t netfs_extract_kvec_to_sg(struct iov_iter *iter,
- ssize_t maxsize,
- struct sg_table *sgtable,
- unsigned int sg_max,
- iov_iter_extraction_t extraction_flags)
-{
- const struct kvec *kv = iter->kvec;
- struct scatterlist *sg = sgtable->sgl + sgtable->nents;
- unsigned long start = iter->iov_offset;
- unsigned int i;
- ssize_t ret = 0;
-
- for (i = 0; i < iter->nr_segs; i++) {
- struct page *page;
- unsigned long kaddr;
- size_t off, len, seg;
-
- len = kv[i].iov_len;
- if (start >= len) {
- start -= len;
- continue;
- }
-
- kaddr = (unsigned long)kv[i].iov_base + start;
- off = kaddr & ~PAGE_MASK;
- len = min_t(size_t, maxsize, len - start);
- kaddr &= PAGE_MASK;
-
- maxsize -= len;
- ret += len;
- do {
- seg = min_t(size_t, len, PAGE_SIZE - off);
- if (is_vmalloc_or_module_addr((void *)kaddr))
- page = vmalloc_to_page((void *)kaddr);
- else
- page = virt_to_page(kaddr);
-
- sg_set_page(sg, page, len, off);
- sgtable->nents++;
- sg++;
- sg_max--;
-
- len -= seg;
- kaddr += PAGE_SIZE;
- off = 0;
- } while (len > 0 && sg_max > 0);
-
- if (maxsize <= 0 || sg_max == 0)
- break;
- start = 0;
- }
-
- if (ret > 0)
- iov_iter_advance(iter, ret);
- return ret;
-}
-
-/*
- * Extract up to sg_max folios from an XARRAY-type iterator and add them to
- * the scatterlist. The pages are not pinned.
- */
-static ssize_t netfs_extract_xarray_to_sg(struct iov_iter *iter,
- ssize_t maxsize,
- struct sg_table *sgtable,
- unsigned int sg_max,
- iov_iter_extraction_t extraction_flags)
-{
- struct scatterlist *sg = sgtable->sgl + sgtable->nents;
- struct xarray *xa = iter->xarray;
- struct folio *folio;
- loff_t start = iter->xarray_start + iter->iov_offset;
- pgoff_t index = start / PAGE_SIZE;
- ssize_t ret = 0;
- size_t offset, len;
- XA_STATE(xas, xa, index);
-
- rcu_read_lock();
-
- xas_for_each(&xas, folio, ULONG_MAX) {
- if (xas_retry(&xas, folio))
- continue;
- if (WARN_ON(xa_is_value(folio)))
- break;
- if (WARN_ON(folio_test_hugetlb(folio)))
- break;
-
- offset = offset_in_folio(folio, start);
- len = min_t(size_t, maxsize, folio_size(folio) - offset);
-
- sg_set_page(sg, folio_page(folio, 0), len, offset);
- sgtable->nents++;
- sg++;
- sg_max--;
-
- maxsize -= len;
- ret += len;
- if (maxsize <= 0 || sg_max == 0)
- break;
- }
-
- rcu_read_unlock();
- if (ret > 0)
- iov_iter_advance(iter, ret);
- return ret;
-}
-
-/**
- * netfs_extract_iter_to_sg - Extract pages from an iterator and add ot an sglist
- * @iter: The iterator to extract from
- * @maxsize: The amount of iterator to copy
- * @sgtable: The scatterlist table to fill in
- * @sg_max: Maximum number of elements in @sgtable that may be filled
- * @extraction_flags: Flags to qualify the request
- *
- * Extract the page fragments from the given amount of the source iterator and
- * add them to a scatterlist that refers to all of those bits, to a maximum
- * addition of @sg_max elements.
- *
- * The pages referred to by UBUF- and IOVEC-type iterators are extracted and
- * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE-
- * and DISCARD-type are not supported.
- *
- * No end mark is placed on the scatterlist; that's left to the caller.
- *
- * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
- * be allowed on the pages extracted.
- *
- * If successul, @sgtable->nents is updated to include the number of elements
- * added and the number of bytes added is returned. @sgtable->orig_nents is
- * left unaltered.
- *
- * The iov_iter_extract_mode() function should be used to query how cleanup
- * should be performed.
- */
-ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t maxsize,
- struct sg_table *sgtable, unsigned int sg_max,
- iov_iter_extraction_t extraction_flags)
-{
- if (maxsize == 0)
- return 0;
-
- switch (iov_iter_type(iter)) {
- case ITER_UBUF:
- case ITER_IOVEC:
- return netfs_extract_user_to_sg(iter, maxsize, sgtable, sg_max,
- extraction_flags);
- case ITER_BVEC:
- return netfs_extract_bvec_to_sg(iter, maxsize, sgtable, sg_max,
- extraction_flags);
- case ITER_KVEC:
- return netfs_extract_kvec_to_sg(iter, maxsize, sgtable, sg_max,
- extraction_flags);
- case ITER_XARRAY:
- return netfs_extract_xarray_to_sg(iter, maxsize, sgtable, sg_max,
- extraction_flags);
- default:
- pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter));
- WARN_ON_ONCE(1);
- return -EIO;
- }
-}
-EXPORT_SYMBOL_GPL(netfs_extract_iter_to_sg);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 59b7d60ae33e..8a2321d19194 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -938,7 +938,7 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
/*
* Grab and keep cached pages associated with a file in the svc_rqst
- * so that they can be passed to the network sendmsg/sendpage routines
+ * so that they can be passed to the network sendmsg routines
* directly. They will be released after the sending has completed.
*
* Return values: Number of bytes consumed, or -EIO if there are no
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aecbd712a00c..960080753d3b 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -930,19 +930,22 @@ out:
}
static void o2net_sendpage(struct o2net_sock_container *sc,
- void *kmalloced_virt,
- size_t size)
+ void *virt, size_t size)
{
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+ struct msghdr msg = {};
+ struct bio_vec bv;
ssize_t ret;
+ bvec_set_virt(&bv, virt, size);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size);
+
while (1) {
+ msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES;
mutex_lock(&sc->sc_send_lock);
- ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
- virt_to_page(kmalloced_virt),
- offset_in_page(kmalloced_virt),
- size, MSG_DONTWAIT);
+ ret = sock_sendmsg(sc->sc_sock, &msg);
mutex_unlock(&sc->sc_send_lock);
+
if (ret == size)
break;
if (ret == (ssize_t)-EAGAIN) {
@@ -2087,18 +2090,24 @@ void o2net_stop_listening(struct o2nm_node *node)
int o2net_init(void)
{
+ struct folio *folio;
+ void *p;
unsigned long i;
o2quo_init();
-
o2net_debugfs_init();
- o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
- o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
- o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
- if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+ if (!folio)
goto out;
+ p = folio_address(folio);
+ o2net_hand = p;
+ p += sizeof(struct o2net_handshake);
+ o2net_keep_req = p;
+ p += sizeof(struct o2net_msg);
+ o2net_keep_resp = p;
+
o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
o2net_hand->connector_id = cpu_to_be64(1);
@@ -2124,9 +2133,6 @@ int o2net_init(void)
return 0;
out:
- kfree(o2net_hand);
- kfree(o2net_keep_req);
- kfree(o2net_keep_resp);
o2net_debugfs_exit();
o2quo_exit();
return -ENOMEM;
@@ -2135,8 +2141,6 @@ out:
void o2net_exit(void)
{
o2quo_exit();
- kfree(o2net_hand);
- kfree(o2net_keep_req);
- kfree(o2net_keep_resp);
o2net_debugfs_exit();
+ folio_put(virt_to_folio(o2net_hand));
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index a8bb9d00d33a..5639d8c48570 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -4373,8 +4373,8 @@ static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst,
}
sgtable.orig_nents = sgtable.nents;
- rc = netfs_extract_iter_to_sg(iter, count, &sgtable,
- num_sgs - sgtable.nents, 0);
+ rc = extract_iter_to_sg(iter, count, &sgtable,
+ num_sgs - sgtable.nents, 0);
iov_iter_revert(iter, rc);
sgtable.orig_nents = sgtable.nents;
}
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 0362ebd4fa0f..223e17c16b60 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -2227,7 +2227,7 @@ static int smbd_iter_to_mr(struct smbd_connection *info,
memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
- ret = netfs_extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
+ ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
WARN_ON(ret < 0);
if (sgt->nents > 0)
sg_mark_end(&sgt->sgl[sgt->nents - 1]);
diff --git a/fs/splice.c b/fs/splice.c
index 7a9565d8ec4f..004eb1c4ce31 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -33,6 +33,7 @@
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/gfp.h>
+#include <linux/net.h>
#include <linux/socket.h>
#include <linux/sched/signal.h>
@@ -415,30 +416,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = {
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
-/*
- * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
- * using sendpage(). Return the number of bytes sent.
- */
-static int pipe_to_sendpage(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf, struct splice_desc *sd)
-{
- struct file *file = sd->u.file;
- loff_t pos = sd->pos;
- int more;
-
- if (!likely(file->f_op->sendpage))
- return -EINVAL;
-
- more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
-
- if (sd->len < sd->total_len &&
- pipe_occupancy(pipe->head, pipe->tail) > 1)
- more |= MSG_SENDPAGE_NOTLAST;
-
- return file->f_op->sendpage(file, buf->page, buf->offset,
- sd->len, &pos, more);
-}
-
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
smp_mb();
@@ -619,7 +596,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des
* Description:
* This function does little more than loop over the pipe and call
* @actor to do the actual moving of a single struct pipe_buffer to
- * the desired destination. See pipe_to_file, pipe_to_sendpage, or
+ * the desired destination. See pipe_to_file, pipe_to_sendmsg, or
* pipe_to_user.
*
*/
@@ -800,8 +777,9 @@ done:
EXPORT_SYMBOL(iter_file_splice_write);
+#ifdef CONFIG_NET
/**
- * generic_splice_sendpage - splice data from a pipe to a socket
+ * splice_to_socket - splice data from a pipe to a socket
* @pipe: pipe to splice from
* @out: socket to write to
* @ppos: position in @out
@@ -813,13 +791,129 @@ EXPORT_SYMBOL(iter_file_splice_write);
* is involved.
*
*/
-ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
- loff_t *ppos, size_t len, unsigned int flags)
+ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
{
- return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
-}
+ struct socket *sock = sock_from_file(out);
+ struct bio_vec bvec[16];
+ struct msghdr msg = {};
+ ssize_t ret = 0;
+ size_t spliced = 0;
+ bool need_wakeup = false;
+
+ pipe_lock(pipe);
+
+ while (len > 0) {
+ unsigned int head, tail, mask, bc = 0;
+ size_t remain = len;
+
+ /*
+ * Check for signal early to make process killable when there
+ * are always buffers available
+ */
+ ret = -ERESTARTSYS;
+ if (signal_pending(current))
+ break;
+
+ while (pipe_empty(pipe->head, pipe->tail)) {
+ ret = 0;
+ if (!pipe->writers)
+ goto out;
+
+ if (spliced)
+ goto out;
+
+ ret = -EAGAIN;
+ if (flags & SPLICE_F_NONBLOCK)
+ goto out;
+
+ ret = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto out;
+
+ if (need_wakeup) {
+ wakeup_pipe_writers(pipe);
+ need_wakeup = false;
+ }
+
+ pipe_wait_readable(pipe);
+ }
+
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
+
+ while (!pipe_empty(head, tail)) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ size_t seg;
+
+ if (!buf->len) {
+ tail++;
+ continue;
+ }
+
+ seg = min_t(size_t, remain, buf->len);
+
+ ret = pipe_buf_confirm(pipe, buf);
+ if (unlikely(ret)) {
+ if (ret == -ENODATA)
+ ret = 0;
+ break;
+ }
-EXPORT_SYMBOL(generic_splice_sendpage);
+ bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
+ remain -= seg;
+ if (remain == 0 || bc >= ARRAY_SIZE(bvec))
+ break;
+ tail++;
+ }
+
+ if (!bc)
+ break;
+
+ msg.msg_flags = MSG_SPLICE_PAGES;
+ if (flags & SPLICE_F_MORE)
+ msg.msg_flags |= MSG_MORE;
+ if (remain && pipe_occupancy(pipe->head, tail) > 0)
+ msg.msg_flags |= MSG_MORE;
+
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
+ len - remain);
+ ret = sock_sendmsg(sock, &msg);
+ if (ret <= 0)
+ break;
+
+ spliced += ret;
+ len -= ret;
+ tail = pipe->tail;
+ while (ret > 0) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ size_t seg = min_t(size_t, ret, buf->len);
+
+ buf->offset += seg;
+ buf->len -= seg;
+ ret -= seg;
+
+ if (!buf->len) {
+ pipe_buf_release(pipe, buf);
+ tail++;
+ }
+ }
+
+ if (tail != pipe->tail) {
+ pipe->tail = tail;
+ if (pipe->files)
+ need_wakeup = true;
+ }
+ }
+
+out:
+ pipe_unlock(pipe);
+ if (need_wakeup)
+ wakeup_pipe_writers(pipe);
+ return spliced ?: ret;
+}
+#endif
static int warn_unsupported(struct file *file, const char *op)
{
@@ -840,6 +934,17 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
return out->f_op->splice_write(pipe, out, ppos, len, flags);
}
+/*
+ * Indicate to the caller that there was a premature EOF when reading from the
+ * source and the caller didn't indicate they would be sending more data after
+ * this.
+ */
+static void do_splice_eof(struct splice_desc *sd)
+{
+ if (sd->splice_eof)
+ sd->splice_eof(sd);
+}
+
/**
* vfs_splice_read - Read data from a file and splice it into a pipe
* @in: File to splice from
@@ -944,13 +1049,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
*/
bytes = 0;
len = sd->total_len;
+
+ /* Don't block on output, we have to drain the direct pipe. */
flags = sd->flags;
+ sd->flags &= ~SPLICE_F_NONBLOCK;
/*
- * Don't block on output, we have to drain the direct pipe.
+ * We signal MORE until we've read sufficient data to fulfill the
+ * request and we keep signalling it if the caller set it.
*/
- sd->flags &= ~SPLICE_F_NONBLOCK;
more = sd->flags & SPLICE_F_MORE;
+ sd->flags |= SPLICE_F_MORE;
WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
@@ -960,20 +1069,18 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
ret = vfs_splice_read(in, &pos, pipe, len, flags);
if (unlikely(ret <= 0))
- goto out_release;
+ goto read_failure;
read_len = ret;
sd->total_len = read_len;
/*
- * If more data is pending, set SPLICE_F_MORE
- * If this is the last data and SPLICE_F_MORE was not set
- * initially, clears it.
+ * If we now have sufficient data to fulfill the request then
+ * we clear SPLICE_F_MORE if it was not set initially.
*/
- if (read_len < len)
- sd->flags |= SPLICE_F_MORE;
- else if (!more)
+ if (read_len >= len && !more)
sd->flags &= ~SPLICE_F_MORE;
+
/*
* NOTE: nonblocking mode only applies to the input. We
* must not do the output in nonblocking mode as then we
@@ -1000,6 +1107,15 @@ done:
file_accessed(in);
return bytes;
+read_failure:
+ /*
+ * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
+ * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
+ * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
+ * least 1 byte *then* we will also do the ->splice_eof() call.
+ */
+ if (ret == 0 && !more && len > 0 && bytes)
+ do_splice_eof(sd);
out_release:
/*
* If we did an incomplete transfer we must release
@@ -1028,6 +1144,14 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
sd->flags);
}
+static void direct_file_splice_eof(struct splice_desc *sd)
+{
+ struct file *file = sd->u.file;
+
+ if (file->f_op->splice_eof)
+ file->f_op->splice_eof(file);
+}
+
/**
* do_splice_direct - splices data directly between two files
* @in: file to splice from
@@ -1053,6 +1177,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
.flags = flags,
.pos = *ppos,
.u.file = out,
+ .splice_eof = direct_file_splice_eof,
.opos = opos,
};
long ret;