diff options
50 files changed, 3218 insertions, 2350 deletions
diff --git a/block/bio.c b/block/bio.c index 2693f34afb7e..2e421c0dad13 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1236,11 +1236,11 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { + iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; - unsigned int gup_flags = 0; ssize_t size, left; unsigned len, i = 0; size_t offset, trim; @@ -1255,7 +1255,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) - gup_flags |= FOLL_PCI_P2PDMA; + extraction_flags |= ITER_ALLOW_P2PDMA; /* * Each segment in the iov is required to be a block size multiple. @@ -1266,7 +1266,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) */ size = iov_iter_get_pages(iter, pages, UINT_MAX - bio->bi_iter.bi_size, - nr_pages, &offset, gup_flags); + nr_pages, &offset, extraction_flags); if (unlikely(size <= 0)) return size ? size : -EFAULT; diff --git a/block/blk-map.c b/block/blk-map.c index 9ee4be4ba2f1..9137d16cecdc 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -264,9 +264,9 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { + iov_iter_extraction_t extraction_flags = 0; unsigned int max_sectors = queue_max_hw_sectors(rq->q); unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); - unsigned int gup_flags = 0; struct bio *bio; int ret; int j; @@ -279,7 +279,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, return -ENOMEM; if (blk_queue_pci_p2pdma(rq->q)) - gup_flags |= FOLL_PCI_P2PDMA; + extraction_flags |= ITER_ALLOW_P2PDMA; while (iov_iter_count(iter)) { struct page **pages, *stack_pages[UIO_FASTIOV]; @@ -290,10 +290,10 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (nr_vecs <= ARRAY_SIZE(stack_pages)) { pages = stack_pages; bytes = iov_iter_get_pages(iter, pages, LONG_MAX, - nr_vecs, &offs, gup_flags); + nr_vecs, &offs, extraction_flags); } else { bytes = iov_iter_get_pages_alloc(iter, &pages, - LONG_MAX, &offs, gup_flags); + LONG_MAX, &offs, extraction_flags); } if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 3b7e3b9e4fd2..4c0d53bf931a 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -18,40 +18,38 @@ config CIFS select DNS_RESOLVER select ASN1 select OID_REGISTRY + select NETFS_SUPPORT help - This is the client VFS module for the SMB3 family of NAS protocols, - (including support for the most recent, most secure dialect SMB3.1.1) - as well as for earlier dialects such as SMB2.1, SMB2 and the older - Common Internet File System (CIFS) protocol. CIFS was the successor - to the original dialect, the Server Message Block (SMB) protocol, the - native file sharing mechanism for most early PC operating systems. - - The SMB3 protocol is supported by most modern operating systems - and NAS appliances (e.g. Samba, Windows 10, Windows Server 2016, - MacOS) and even in the cloud (e.g. Microsoft Azure). - The older CIFS protocol was included in Windows NT4, 2000 and XP (and - later) as well by Samba (which provides excellent CIFS and SMB3 - server support for Linux and many other operating systems). Use of - dialects older than SMB2.1 is often discouraged on public networks. + This is the client VFS module for the SMB3 family of network file + protocols (including the most recent, most secure dialect SMB3.1.1). + This module also includes support for earlier dialects such as + SMB2.1, SMB2 and even the old Common Internet File System (CIFS) + protocol. CIFS was the successor to the original network filesystem + protocol, Server Message Block (SMB ie SMB1), the native file sharing + mechanism for most early PC operating systems. + + The SMB3.1.1 protocol is supported by most modern operating systems + and NAS appliances (e.g. Samba, Windows 11, Windows Server 2022, + MacOS) and even in the cloud (e.g. Microsoft Azure) and also by the + Linux kernel server, ksmbd. Support for the older CIFS protocol was + included in Windows NT4, 2000 and XP (and later). Use of dialects + older than SMB2.1 is often discouraged on public networks. This module also provides limited support for OS/2 and Windows ME and similar very old servers. - This module provides an advanced network file system client - for mounting to SMB3 (and CIFS) compliant servers. It includes - support for DFS (hierarchical name space), secure per-user - session establishment via Kerberos or NTLM or NTLMv2, RDMA - (smbdirect), advanced security features, per-share encryption, - directory leases, safe distributed caching (oplock), optional packet - signing, Unicode and other internationalization improvements. + This module provides an advanced network file system client for + mounting to SMB3 (and CIFS) compliant servers. It includes support + for DFS (hierarchical name space), secure per-user session + establishment via Kerberos or NTLMv2, RDMA (smbdirect), advanced + security features, per-share encryption, packet-signing, snapshots, + directory leases, safe distributed caching (leases), multichannel, + Unicode and other internationalization improvements. In general, the default dialects, SMB3 and later, enable better performance, security and features, than would be possible with CIFS. - Note that when mounting to Samba, due to the CIFS POSIX extensions, - CIFS mounts can provide slightly better POSIX compatibility - than SMB3 mounts. SMB2/SMB3 mount options are also - slightly simpler (compared to CIFS) due to protocol improvements. - If you need to mount to Samba, Azure, Macs or Windows from this machine, say Y. + If you need to mount to Samba, Azure, ksmbd, Macs or Windows from this + machine, say Y. config CIFS_STATS2 bool "Extended statistics" @@ -111,12 +109,12 @@ config CIFS_POSIX depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR help Enabling this option will cause the cifs client to attempt to - negotiate a newer dialect with servers, such as Samba 3.0.5 - or later, that optionally can handle more POSIX like (rather - than Windows like) file behavior. It also enables - support for POSIX ACLs (getfacl and setfacl) to servers - (such as Samba 3.10 and later) which can negotiate - CIFS POSIX ACL support. If unsure, say N. + negotiate a feature of the older cifs dialect with servers, such as + Samba 3.0.5 or later, that optionally can handle more POSIX like + (rather than Windows like) file behavior. It also enables support + for POSIX ACLs (getfacl and setfacl) to servers (such as Samba 3.10 + and later) which can negotiate CIFS POSIX ACL support. This config + option is not needed when mounting with SMB3.1.1. If unsure, say N. config CIFS_DEBUG bool "Enable CIFS debugging routines" @@ -178,6 +176,8 @@ config CIFS_NFSD_EXPORT help Allows NFS server to export a CIFS mounted share (nfsd over cifs) +if CIFS + config CIFS_SMB_DIRECT bool "SMB Direct support" depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y @@ -201,3 +201,5 @@ config CIFS_ROOT Enables root file system support over SMB protocol. Most people say N here. + +endif diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index 60399081046a..75d5e06306ea 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -14,6 +14,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); +static void smb2_close_cached_fid(struct kref *ref); static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, const char *path, @@ -181,12 +182,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE); - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.fid = pfid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE), + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .fid = pfid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -220,8 +222,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, } goto oshr_free; } - - atomic_inc(&tcon->num_remote_opens); + cfid->tcon = tcon; + cfid->is_open = true; o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; @@ -233,12 +235,12 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) goto oshr_free; - smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL, NULL); - + if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) + goto oshr_free; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) goto oshr_free; @@ -259,9 +261,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, } } cfid->dentry = dentry; - cfid->tcon = tcon; cfid->time = jiffies; - cfid->is_open = true; cfid->has_lease = true; oshr_free: @@ -271,7 +271,7 @@ oshr_free: free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); spin_lock(&cfids->cfid_list_lock); - if (!cfid->has_lease) { + if (rc && !cfid->has_lease) { if (cfid->on_list) { list_del(&cfid->entry); cfid->on_list = false; @@ -280,13 +280,27 @@ oshr_free: rc = -ENOENT; } spin_unlock(&cfids->cfid_list_lock); + if (!rc && !cfid->has_lease) { + /* + * We are guaranteed to have two references at this point. + * One for the caller and one for a potential lease. + * Release the Lease-ref so that the directory will be closed + * when the caller closes the cached handle. + */ + kref_put(&cfid->refcount, smb2_close_cached_fid); + } if (rc) { + if (cfid->is_open) + SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid); free_cached_dir(cfid); cfid = NULL; } - if (rc == 0) + if (rc == 0) { *ret_cfid = cfid; + atomic_inc(&tcon->num_remote_opens); + } return rc; } @@ -335,6 +349,7 @@ smb2_close_cached_fid(struct kref *ref) if (cfid->is_open) { SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); + atomic_dec(&cfid->tcon->num_remote_opens); } free_cached_dir(cfid); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 56b23def4c95..1911f7016fa1 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/ctype.h> +#include <linux/kstrtox.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/uaccess.h> @@ -455,8 +456,10 @@ skip_rdma: spin_lock(&ses->iface_lock); if (ses->iface_count) - seq_printf(m, "\n\n\tServer interfaces: %zu", - ses->iface_count); + seq_printf(m, "\n\n\tServer interfaces: %zu" + "\tLast updated: %lu seconds ago", + ses->iface_count, + (jiffies - ses->iface_last_update) / HZ); j = 0; list_for_each_entry(iface, &ses->iface_list, iface_head) { @@ -787,7 +790,7 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, rc = get_user(c[0], buffer); if (rc) return rc; - if (strtobool(c, &bv) == 0) + if (kstrtobool(c, &bv) == 0) cifsFYI = bv; else if ((c[0] > '1') && (c[0] <= '9')) cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */ @@ -947,7 +950,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if (count < 3) { /* single char or single char followed by null */ - if (strtobool(flags_string, &bv) == 0) { + if (kstrtobool(flags_string, &bv) == 0) { global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF; return count; } else if (!isdigit(flags_string[0])) { diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 7f102ffeb675..e4d751b0c812 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -24,7 +24,7 @@ struct cifs_spnego_msg { uint32_t flags; uint32_t sesskey_len; uint32_t secblob_len; - uint8_t data[1]; + uint8_t data[]; }; #ifdef __KERNEL__ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 9a2d390bd06f..f5b6df82e857 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1428,14 +1428,15 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = get_xid(); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = READ_CONTROL; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = READ_CONTROL, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { @@ -1494,14 +1495,15 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, else access_flags = WRITE_DAC; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = access_flags; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = access_flags, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) { diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index cbc18b4a9cb2..357bd27a7fd1 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -24,12 +24,156 @@ #include "../smbfs_common/arc4.h" #include <crypto/aead.h> +/* + * Hash data from a BVEC-type iterator. + */ +static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + const struct bio_vec *bv = iter->bvec; + unsigned long start = iter->iov_offset; + unsigned int i; + void *p; + int ret; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + p = kmap_local_page(bv[i].bv_page); + ret = crypto_shash_update(shash, p + off, len); + kunmap_local(p); + if (ret < 0) + return ret; + + maxsize -= len; + if (maxsize <= 0) + break; + start = 0; + } + + return 0; +} + +/* + * Hash data from a KVEC-type iterator. + */ +static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + const struct kvec *kv = iter->kvec; + unsigned long start = iter->iov_offset; + unsigned int i; + int ret; + + for (i = 0; i < iter->nr_segs; i++) { + size_t len; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + ret = crypto_shash_update(shash, kv[i].iov_base + start, len); + if (ret < 0) + return ret; + maxsize -= len; + + if (maxsize <= 0) + break; + start = 0; + } + + return 0; +} + +/* + * Hash data from an XARRAY-type iterator. + */ +static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + struct folio *folios[16], *folio; + unsigned int nr, i, j, npages; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t last, index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t len, offset, foffset; + void *p; + + if (maxsize == 0) + return 0; + + last = (start + maxsize - 1) / PAGE_SIZE; + do { + nr = xa_extract(iter->xarray, (void **)folios, index, last, + ARRAY_SIZE(folios), XA_PRESENT); + if (nr == 0) + return -EIO; + + for (i = 0; i < nr; i++) { + folio = folios[i]; + npages = folio_nr_pages(folio); + foffset = start - folio_pos(folio); + offset = foffset % PAGE_SIZE; + for (j = foffset / PAGE_SIZE; j < npages; j++) { + len = min_t(size_t, maxsize, PAGE_SIZE - offset); + p = kmap_local_page(folio_page(folio, j)); + ret = crypto_shash_update(shash, p, len); + kunmap_local(p); + if (ret < 0) + return ret; + maxsize -= len; + if (maxsize <= 0) + return 0; + start += len; + offset = 0; + index++; + } + } + } while (nr == ARRAY_SIZE(folios)); + return 0; +} + +/* + * Pass the data from an iterator into a hash. + */ +static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize, + struct shash_desc *shash) +{ + if (maxsize == 0) + return 0; + + switch (iov_iter_type(iter)) { + case ITER_BVEC: + return cifs_shash_bvec(iter, maxsize, shash); + case ITER_KVEC: + return cifs_shash_kvec(iter, maxsize, shash); + case ITER_XARRAY: + return cifs_shash_xarray(iter, maxsize, shash); + default: + pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter)); + WARN_ON_ONCE(1); + return -EIO; + } +} + int __cifs_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, char *signature, - struct shash_desc *shash) + struct TCP_Server_Info *server, char *signature, + struct shash_desc *shash) { int i; - int rc; + ssize_t rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; @@ -61,25 +205,9 @@ int __cifs_calc_signature(struct smb_rqst *rqst, } } - /* now hash over the rq_pages array */ - for (i = 0; i < rqst->rq_npages; i++) { - void *kaddr; - unsigned int len, offset; - - rqst_page_get_length(rqst, i, &len, &offset); - - kaddr = (char *) kmap(rqst->rq_pages[i]) + offset; - - rc = crypto_shash_update(shash, kaddr, len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with payload\n", - __func__); - kunmap(rqst->rq_pages[i]); - return rc; - } - - kunmap(rqst->rq_pages[i]); - } + rc = cifs_shash_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), shash); + if (rc < 0) + return rc; rc = crypto_shash_final(shash, signature); if (rc) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cb7c5460a80b..cbcf210d56e4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1359,7 +1359,7 @@ const struct file_operations cifs_file_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1379,7 +1379,7 @@ const struct file_operations cifs_file_strict_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1399,7 +1399,7 @@ const struct file_operations cifs_file_direct_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1417,7 +1417,7 @@ const struct file_operations cifs_file_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1435,7 +1435,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1453,7 +1453,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b58cd737b21e..71fe0a0a7992 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,6 +100,9 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); +extern ssize_t cifs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); @@ -110,6 +113,9 @@ extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, struct dir_context *ctx); +extern void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len); +extern void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len); +extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len); /* Functions related to dir entries */ extern const struct dentry_operations cifs_dentry_ops; @@ -154,5 +160,5 @@ extern const struct export_operations cifs_export_ops; /* when changing internal version - update following two lines at same time */ #define SMB3_PRODUCT_BUILD 41 -#define CIFS_VERSION "2.41" +#define CIFS_VERSION "2.42" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cd8171a1c9a0..a99883f16d94 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -78,10 +78,6 @@ #define SMB_ECHO_INTERVAL_MAX 600 #define SMB_ECHO_INTERVAL_DEFAULT 60 -/* dns resolution intervals in seconds */ -#define SMB_DNS_RESOLVE_INTERVAL_MIN 120 -#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600 - /* smb multichannel query server interfaces interval in seconds */ #define SMB_INTERFACE_POLL_INTERVAL 600 @@ -217,11 +213,9 @@ static inline void cifs_free_open_info(struct cifs_open_info_data *data) struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ - struct page **rq_pages; /* pointer to array of page ptrs */ - unsigned int rq_offset; /* the offset to the 1st page */ - unsigned int rq_npages; /* number pages in array */ - unsigned int rq_pagesz; /* page size to use */ - unsigned int rq_tailsz; /* length of last page */ + size_t rq_iter_size; /* Amount of data in ->rq_iter */ + struct iov_iter rq_iter; /* Data iterator */ + struct xarray rq_buffer; /* Page buffer for encryption */ }; struct mid_q_entry; @@ -692,7 +686,6 @@ struct TCP_Server_Info { /* point to the SMBD connection if RDMA is used instead of socket */ struct smbd_connection *smbd_conn; struct delayed_work echo; /* echo ping workqueue job */ - struct delayed_work resolve; /* dns resolution workqueue job */ char *smallbuf; /* pointer to current "small" buffer */ char *bigbuf; /* pointer to current "big" buffer */ /* Total size of this PDU. Only valid from cifs_demultiplex_thread */ @@ -1427,10 +1420,11 @@ struct cifs_aio_ctx { struct cifsFileInfo *cfile; struct bio_vec *bv; loff_t pos; - unsigned int npages; + unsigned int nr_pinned_pages; ssize_t rc; unsigned int len; unsigned int total_len; + unsigned int bv_need_unpin; /* If ->bv[] needs unpinning */ bool should_dirty; /* * Indicates if this aio_ctx is for direct_io, @@ -1448,28 +1442,18 @@ struct cifs_readdata { struct address_space *mapping; struct cifs_aio_ctx *ctx; __u64 offset; + ssize_t got_bytes; unsigned int bytes; - unsigned int got_bytes; pid_t pid; int result; struct work_struct work; - int (*read_into_pages)(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - unsigned int len); - int (*copy_into_pages)(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter); + struct iov_iter iter; struct kvec iov[2]; struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT struct smbd_mr *mr; #endif - unsigned int pagesz; - unsigned int page_offset; - unsigned int tailsz; struct cifs_credits credits; - unsigned int nr_pages; - struct page **pages; }; /* asynchronous write support */ @@ -1481,6 +1465,8 @@ struct cifs_writedata { struct work_struct work; struct cifsFileInfo *cfile; struct cifs_aio_ctx *ctx; + struct iov_iter iter; + struct bio_vec *bv; __u64 offset; pid_t pid; unsigned int bytes; @@ -1489,12 +1475,7 @@ struct cifs_writedata { #ifdef CONFIG_CIFS_SMB_DIRECT struct smbd_mr *mr; #endif - unsigned int pagesz; - unsigned int page_offset; - unsigned int tailsz; struct cifs_credits credits; - unsigned int nr_pages; - struct page **pages; }; /* @@ -2154,15 +2135,21 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const dst->FileNameLength = src->FileNameLength; } -static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, - int num_rqst, - const u8 *sig) +static inline int cifs_get_num_sgs(const struct smb_rqst *rqst, + int num_rqst, + const u8 *sig) { unsigned int len, skip; unsigned int nents = 0; unsigned long addr; int i, j; + /* + * The first rqst has a transform header where the first 20 bytes are + * not part of the encrypted blob. + */ + skip = 20; + /* Assumes the first rqst has a transform header as the first iov. * I.e. * rqst[0].rq_iov[0] is transform header @@ -2170,14 +2157,22 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, * rqst[1+].rq_iov[0+] data to be encrypted/decrypted */ for (i = 0; i < num_rqst; i++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob. + /* We really don't want a mixture of pinned and unpinned pages + * in the sglist. It's hard to keep track of which is what. + * Instead, we convert to a BVEC-type iterator higher up. */ + if (WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter))) + return -EIO; + + /* We also don't want to have any extra refs or pins to clean + * up in the sglist. + */ + if (WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter))) + return -EIO; + for (j = 0; j < rqst[i].rq_nvec; j++) { struct kvec *iov = &rqst[i].rq_iov[j]; - skip = (i == 0) && (j == 0) ? 20 : 0; addr = (unsigned long)iov->iov_base + skip; if (unlikely(is_vmalloc_addr((void *)addr))) { len = iov->iov_len - skip; @@ -2186,8 +2181,9 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, } else { nents++; } + skip = 0; } - nents += rqst[i].rq_npages; + nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX); } nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); return nents; @@ -2196,9 +2192,9 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, /* We can not use the normal sg_set_buf() as we will sometimes pass a * stack object as buf. */ -static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, - const void *buf, - unsigned int buflen) +static inline void cifs_sg_set_buf(struct sg_table *sgtable, + const void *buf, + unsigned int buflen) { unsigned long addr = (unsigned long)buf; unsigned int off = offset_in_page(addr); @@ -2208,16 +2204,17 @@ static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, do { unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); - sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off); + sg_set_page(&sgtable->sgl[sgtable->nents++], + vmalloc_to_page((void *)addr), len, off); off = 0; addr += PAGE_SIZE; buflen -= len; } while (buflen); } else { - sg_set_page(sg++, virt_to_page(addr), buflen, off); + sg_set_page(&sgtable->sgl[sgtable->nents++], + virt_to_page(addr), buflen, off); } - return sg; } #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 623caece2b10..445e3eaebcc1 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -562,7 +562,7 @@ typedef union smb_com_session_setup_andx { __u32 Reserved; __le32 Capabilities; /* see below */ __le16 ByteCount; - unsigned char SecurityBlob[1]; /* followed by */ + unsigned char SecurityBlob[]; /* followed by */ /* STRING NativeOS */ /* STRING NativeLanMan */ } __attribute__((packed)) req; /* NTLM request format (with @@ -582,7 +582,7 @@ typedef union smb_com_session_setup_andx { __u32 Reserved; /* see below */ __le32 Capabilities; __le16 ByteCount; - unsigned char CaseInsensitivePassword[1]; /* followed by: */ + unsigned char CaseInsensitivePassword[]; /* followed by: */ /* unsigned char * CaseSensitivePassword; */ /* STRING AccountName */ /* STRING PrimaryDomain */ @@ -599,7 +599,7 @@ typedef union smb_com_session_setup_andx { __le16 Action; /* see below */ __le16 SecurityBlobLength; __u16 ByteCount; - unsigned char SecurityBlob[1]; /* followed by */ + unsigned char SecurityBlob[]; /* followed by */ /* unsigned char * NativeOS; */ /* unsigned char * NativeLanMan; */ /* unsigned char * PrimaryDomain; */ @@ -618,7 +618,7 @@ typedef union smb_com_session_setup_andx { __le16 PasswordLength; __u32 Reserved; /* encrypt key len and offset */ __le16 ByteCount; - unsigned char AccountPassword[1]; /* followed by */ + unsigned char AccountPassword[]; /* followed by */ /* STRING AccountName */ /* STRING PrimaryDomain */ /* STRING NativeOS */ @@ -632,7 +632,7 @@ typedef union smb_com_session_setup_andx { __le16 AndXOffset; __le16 Action; /* see below */ __u16 ByteCount; - unsigned char NativeOS[1]; /* followed by */ + unsigned char NativeOS[]; /* followed by */ /* unsigned char * NativeLanMan; */ /* unsigned char * PrimaryDomain; */ } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */ @@ -693,7 +693,7 @@ typedef struct smb_com_tconx_req { __le16 Flags; /* see below */ __le16 PasswordLength; __le16 ByteCount; - unsigned char Password[1]; /* followed by */ + unsigned char Password[]; /* followed by */ /* STRING Path *//* \\server\share name */ /* STRING Service */ } __attribute__((packed)) TCONX_REQ; @@ -705,7 +705,7 @@ typedef struct smb_com_tconx_rsp { __le16 AndXOffset; __le16 OptionalSupport; /* see below */ __u16 ByteCount; - unsigned char Service[1]; /* always ASCII, not Unicode */ + unsigned char Service[]; /* always ASCII, not Unicode */ /* STRING NativeFileSystem */ } __attribute__((packed)) TCONX_RSP; @@ -718,7 +718,7 @@ typedef struct smb_com_tconx_rsp_ext { __le32 MaximalShareAccessRights; __le32 GuestMaximalShareAccessRights; __u16 ByteCount; - unsigned char Service[1]; /* always ASCII, not Unicode */ + unsigned char Service[]; /* always ASCII, not Unicode */ /* STRING NativeFileSystem */ } __attribute__((packed)) TCONX_RSP_EXT; @@ -755,14 +755,14 @@ typedef struct smb_com_echo_req { struct smb_hdr hdr; __le16 EchoCount; __le16 ByteCount; - char Data[1]; + char Data[]; } __attribute__((packed)) ECHO_REQ; typedef struct smb_com_echo_rsp { struct smb_hdr hdr; __le16 SequenceNumber; __le16 ByteCount; - char Data[1]; + char Data[]; } __attribute__((packed)) ECHO_RSP; typedef struct smb_com_logoff_andx_req { @@ -862,7 +862,7 @@ typedef struct smb_com_open_req { /* also handles create */ __le32 ImpersonationLevel; __u8 SecurityFlags; __le16 ByteCount; - char fileName[1]; + char fileName[]; } __attribute__((packed)) OPEN_REQ; /* open response: oplock levels */ @@ -937,7 +937,7 @@ typedef struct smb_com_openx_req { __le32 Timeout; __le32 Reserved; __le16 ByteCount; /* file name follows */ - char fileName[1]; + char fileName[]; } __attribute__((packed)) OPENX_REQ; typedef struct smb_com_openx_rsp { @@ -1085,7 +1085,7 @@ typedef struct smb_com_lock_req { __le16 NumberOfUnlocks; __le16 NumberOfLocks; __le16 ByteCount; - LOCKING_ANDX_RANGE Locks[1]; + LOCKING_ANDX_RANGE Locks[]; } __attribute__((packed)) LOCK_REQ; /* lock type */ @@ -1114,7 +1114,7 @@ typedef struct smb_com_rename_req { __le16 SearchAttributes; /* target file attributes */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName */ } __attribute__((packed)) RENAME_REQ; @@ -1134,7 +1134,7 @@ typedef struct smb_com_copy_req { __le16 Flags; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName string */ } __attribute__((packed)) COPY_REQ; @@ -1144,7 +1144,7 @@ typedef struct smb_com_copy_rsp { __le16 CopyCount; /* number of files copied */ __u16 ByteCount; /* may be zero */ __u8 BufferFormat; /* 0x04 - only present if errored file follows */ - unsigned char ErrorFileName[1]; /* only present if error in copy */ + unsigned char ErrorFileName[]; /* only present if error in copy */ } __attribute__((packed)) COPY_RSP; #define CREATE_HARD_LINK 0x103 @@ -1158,7 +1158,7 @@ typedef struct smb_com_nt_rename_req { /* A5 - also used for create hardlink */ __le32 ClusterCount; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName */ } __attribute__((packed)) NT_RENAME_REQ; @@ -1173,7 +1173,7 @@ typedef struct smb_com_delete_file_req { __le16 SearchAttributes; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char fileName[1]; + unsigned char fileName[]; } __attribute__((packed)) DELETE_FILE_REQ; typedef struct smb_com_delete_file_rsp { @@ -1185,7 +1185,7 @@ typedef struct smb_com_delete_directory_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char DirName[1]; + unsigned char DirName[]; } __attribute__((packed)) DELETE_DIRECTORY_REQ; typedef struct smb_com_delete_directory_rsp { @@ -1197,7 +1197,7 @@ typedef struct smb_com_create_directory_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char DirName[1]; + unsigned char DirName[]; } __attribute__((packed)) CREATE_DIRECTORY_REQ; typedef struct smb_com_create_directory_rsp { @@ -1209,7 +1209,7 @@ typedef struct smb_com_query_information_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; /* 1 + namelen + 1 */ __u8 BufferFormat; /* 4 = ASCII */ - unsigned char FileName[1]; + unsigned char FileName[]; } __attribute__((packed)) QUERY_INFORMATION_REQ; typedef struct smb_com_query_information_rsp { @@ -1229,7 +1229,7 @@ typedef struct smb_com_setattr_req { __le16 reserved[5]; /* must be zero */ __u16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char fileName[1]; + unsigned char fileName[]; } __attribute__((packed)) SETATTR_REQ; typedef struct smb_com_setattr_rsp { @@ -1311,7 +1311,7 @@ typedef struct smb_com_transaction_ioctl_req { __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ __le16 ByteCount; __u8 Pad[3]; - __u8 Data[1]; + __u8 Data[]; } __attribute__((packed)) TRANSACT_IOCTL_REQ; typedef struct smb_com_transaction_compr_ioctl_req { @@ -1430,7 +1430,7 @@ typedef struct smb_com_transaction_change_notify_req { __u8 Reserved2; __le16 ByteCount; /* __u8 Pad[3];*/ -/* __u8 Data[1];*/ +/* __u8 Data[];*/ } __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; /* BB eventually change to use generic ntransact rsp struct @@ -1519,7 +1519,7 @@ struct cifs_quota_data { __u64 space_used; __u64 soft_limit; __u64 hard_limit; - char sid[1]; /* variable size? */ + char sid[]; /* variable size? */ } __attribute__((packed)); /* quota sub commands */ @@ -1671,7 +1671,7 @@ typedef struct smb_com_transaction2_qpi_req { __u8 Pad; __le16 InformationLevel; __u32 Reserved4; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_QPI_REQ; typedef struct smb_com_transaction2_qpi_rsp { @@ -1704,7 +1704,7 @@ typedef struct smb_com_transaction2_spi_req { __u16 Pad1; __le16 InformationLevel; __u32 Reserved4; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_SPI_REQ; typedef struct smb_com_transaction2_spi_rsp { @@ -1809,7 +1809,7 @@ typedef struct smb_com_transaction2_ffirst_req { __le16 SearchFlags; __le16 InformationLevel; __le32 SearchStorageType; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_FFIRST_REQ; typedef struct smb_com_transaction2_ffirst_rsp { @@ -2020,7 +2020,7 @@ typedef struct smb_com_transaction2_get_dfs_refer_req { perhaps?) followed by one byte pad - doesn't seem to matter though */ __le16 MaxReferralLevel; - char RequestFileName[1]; + char RequestFileName[]; } __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ; #define DFS_VERSION cpu_to_le16(0x0003) @@ -2049,7 +2049,7 @@ struct get_dfs_referral_rsp { __le16 PathConsumed; __le16 NumberOfReferrals; __le32 DFSFlags; - REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */ + REFERRAL3 referrals[]; /* array of level 3 dfs_referral structures */ /* followed by the strings pointed to by the referral structures */ } __packed; @@ -2284,7 +2284,10 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ __le32 Mode; __le32 AlignmentRequirement; __le32 FileNameLength; - char FileName[1]; + union { + char __pad; + DECLARE_FLEX_ARRAY(char, FileName); + }; } __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ typedef struct { @@ -2322,7 +2325,7 @@ typedef struct { } __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ typedef struct { - char LinkDest[1]; + DECLARE_FLEX_ARRAY(char, LinkDest); } __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */ /* The following three structures are needed only for @@ -2371,7 +2374,7 @@ struct file_end_of_file_info { } __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */ struct file_alt_name_info { - __u8 alt_name[1]; + DECLARE_FLEX_ARRAY(__u8, alt_name); } __attribute__((packed)); /* level 0x0108 */ struct file_stream_info { @@ -2480,7 +2483,10 @@ typedef struct { __le32 NextEntryOffset; __u32 ResumeKey; /* as with FileIndex - no need to convert */ FILE_UNIX_BASIC_INFO basic; - char FileName[1]; + union { + char __pad; + DECLARE_FLEX_ARRAY(char, FileName); + }; } __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ typedef struct { @@ -2494,7 +2500,7 @@ typedef struct { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ typedef struct { @@ -2509,7 +2515,7 @@ typedef struct { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; /* length of the xattrs */ - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */ typedef struct { @@ -2526,7 +2532,7 @@ typedef struct { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ typedef struct { @@ -2544,7 +2550,7 @@ typedef struct { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ typedef struct { @@ -2559,7 +2565,7 @@ typedef struct { __le32 AllocationSize; __le16 Attributes; /* verify not u32 */ __u8 FileNameLength; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */ @@ -2569,21 +2575,11 @@ struct win_dev { __le64 minor; } __attribute__((packed)); -struct gea { - unsigned char name_len; - char name[1]; -} __attribute__((packed)); - -struct gealist { - unsigned long list_len; - struct gea list[1]; -} __attribute__((packed)); - struct fea { unsigned char EA_flags; __u8 name_len; __le16 value_len; - char name[1]; + char name[]; /* optionally followed by value */ } __attribute__((packed)); /* flags for _FEA.fEA */ @@ -2591,7 +2587,7 @@ struct fea { struct fealist { __le32 list_len; - struct fea list[1]; + struct fea list; } __attribute__((packed)); /* used to hold an arbitrary blob of data */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b8a47704a6ef..b7a36ebd0f2f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -244,6 +244,9 @@ extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read); +int cifs_read_iter_from_socket(struct TCP_Server_Info *server, + struct iov_iter *iter, + unsigned int to_read); extern int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb); void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx); int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx); @@ -581,10 +584,7 @@ int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); int cifs_async_writev(struct cifs_writedata *wdata, void (*release)(struct kref *kref)); void cifs_writev_complete(struct work_struct *work); -struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, - work_func_t complete); -struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages, - work_func_t complete); +struct cifs_writedata *cifs_writedata_alloc(work_func_t complete); void cifs_writedata_release(struct kref *refcount); int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, @@ -601,13 +601,10 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); -int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 60dd4e37030a..a24e4ddf8043 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -25,6 +25,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/uaccess.h> #include "cifspdu.h" +#include "cifsfs.h" #include "cifsglob.h" #include "cifsacl.h" #include "cifsproto.h" @@ -1295,11 +1296,8 @@ cifs_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, - .rq_pages = rdata->pages, - .rq_offset = rdata->page_offset, - .rq_npages = rdata->nr_pages, - .rq_pagesz = rdata->pagesz, - .rq_tailsz = rdata->tailsz }; + .rq_iter_size = iov_iter_count(&rdata->iter), + .rq_iter = rdata->iter }; struct cifs_credits credits = { .value = 1, .instance = 0 }; cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", @@ -1738,11 +1736,8 @@ cifs_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 2; - rqst.rq_pages = wdata->pages; - rqst.rq_offset = wdata->page_offset; - rqst.rq_npages = wdata->nr_pages; - rqst.rq_pagesz = wdata->pagesz; - rqst.rq_tailsz = wdata->tailsz; + rqst.rq_iter = wdata->iter; + rqst.rq_iter_size = iov_iter_count(&wdata->iter); cifs_dbg(FYI, "async write at %llu %u bytes\n", wdata->offset, wdata->bytes); @@ -5373,14 +5368,15 @@ CIFSSMBSetPathInfoFB(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; int rc; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = fileName; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = fileName, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -5787,7 +5783,7 @@ QAllEAsRetry: /* account for ea list len */ list_len -= 4; - temp_fea = ea_response_data->list; + temp_fea = &ea_response_data->list; temp_ptr = (char *)temp_fea; while (list_len > 0) { unsigned int name_len; @@ -5902,7 +5898,7 @@ SetEARetry: else name_len = strnlen(ea_name, 255); - count = sizeof(*parm_data) + ea_value_len + name_len; + count = sizeof(*parm_data) + 1 + ea_value_len + name_len; pSMB->MaxParameterCount = cpu_to_le16(2); /* BB find max SMB PDU from sess */ pSMB->MaxDataCount = cpu_to_le16(1000); @@ -5926,14 +5922,14 @@ SetEARetry: byte_count = 3 /* pad */ + params + count; pSMB->DataCount = cpu_to_le16(count); parm_data->list_len = cpu_to_le32(count); - parm_data->list[0].EA_flags = 0; + parm_data->list.EA_flags = 0; /* we checked above that name len is less than 255 */ - parm_data->list[0].name_len = (__u8)name_len; + parm_data->list.name_len = (__u8)name_len; /* EA names are always ASCII */ if (ea_name) - strncpy(parm_data->list[0].name, ea_name, name_len); - parm_data->list[0].name[name_len] = 0; - parm_data->list[0].value_len = cpu_to_le16(ea_value_len); + strncpy(parm_data->list.name, ea_name, name_len); + parm_data->list.name[name_len] = '\0'; + parm_data->list.value_len = cpu_to_le16(ea_value_len); /* caller ensures that ea_value_len is less than 64K but we need to ensure that it fits within the smb */ @@ -5941,7 +5937,7 @@ SetEARetry: negotiated SMB buffer size BB */ /* if (ea_value_len > buffer_size - 512 (enough for header)) */ if (ea_value_len) - memcpy(parm_data->list[0].name+name_len+1, + memcpy(parm_data->list.name + name_len + 1, ea_value, ea_value_len); pSMB->TotalDataCount = pSMB->DataCount; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e6088d96eb04..ec020d860be3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -79,8 +79,6 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) int len; char *unc; struct sockaddr_storage ss; - time64_t expiry, now; - unsigned long ttl = SMB_DNS_RESOLVE_INTERVAL_DEFAULT; if (!server->hostname) return -EINVAL; @@ -102,29 +100,19 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) ss = server->dstaddr; spin_unlock(&server->srv_lock); - rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, &expiry); + rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, NULL); kfree(unc); if (rc < 0) { cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n", __func__, server->hostname, rc); - goto requeue_resolve; + } else { + spin_lock(&server->srv_lock); + memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr)); + spin_unlock(&server->srv_lock); + rc = 0; } - spin_lock(&server->srv_lock); - memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr)); - spin_unlock(&server->srv_lock); - - now = ktime_get_real_seconds(); - if (expiry && expiry > now) - /* To make sure we don't use the cached entry, retry 1s */ - ttl = max_t(unsigned long, expiry - now, SMB_DNS_RESOLVE_INTERVAL_MIN) + 1; - -requeue_resolve: - cifs_dbg(FYI, "%s: next dns resolution scheduled for %lu seconds in the future\n", - __func__, ttl); - mod_delayed_work(cifsiod_wq, &server->resolve, (ttl * HZ)); - return rc; } @@ -148,26 +136,6 @@ static void smb2_query_server_interfaces(struct work_struct *work) (SMB_INTERFACE_POLL_INTERVAL * HZ)); } -static void cifs_resolve_server(struct work_struct *work) -{ - int rc; - struct TCP_Server_Info *server = container_of(work, - struct TCP_Server_Info, resolve.work); - - cifs_server_lock(server); - - /* - * Resolve the hostname again to make sure that IP address is up-to-date. - */ - rc = reconn_set_ipaddr_from_hostname(server); - if (rc) { - cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", - __func__, rc); - } - - cifs_server_unlock(server); -} - /* * Update the tcpStatus for the server. * This is used to signal the cifsd thread to call cifs_reconnect @@ -766,6 +734,20 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, return cifs_readv_from_socket(server, &smb_msg); } +int +cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, + unsigned int to_read) +{ + struct msghdr smb_msg = { .msg_iter = *iter }; + int ret; + + iov_iter_truncate(&smb_msg.msg_iter, to_read); + ret = cifs_readv_from_socket(server, &smb_msg); + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + static bool is_smb_response(struct TCP_Server_Info *server, unsigned char type) { @@ -926,7 +908,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) spin_unlock(&server->srv_lock); cancel_delayed_work_sync(&server->echo); - cancel_delayed_work_sync(&server->resolve); spin_lock(&server->srv_lock); server->tcpStatus = CifsExiting; @@ -1550,7 +1531,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) cifs_put_tcp_session(server->primary_server, from_reconnect); cancel_delayed_work_sync(&server->echo); - cancel_delayed_work_sync(&server->resolve); if (from_reconnect) /* @@ -1656,7 +1636,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); - INIT_DELAYED_WORK(&tcp_ses->resolve, cifs_resolve_server); INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); mutex_init(&tcp_ses->reconnect_mutex); #ifdef CONFIG_CIFS_DFS_UPCALL @@ -1745,12 +1724,6 @@ smbd_connected: /* queue echo request delayed work */ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); - /* queue dns resolution delayed work */ - cifs_dbg(FYI, "%s: next dns resolution scheduled for %d seconds in the future\n", - __func__, SMB_DNS_RESOLVE_INTERVAL_DEFAULT); - - queue_delayed_work(cifsiod_wq, &tcp_ses->resolve, (SMB_DNS_RESOLVE_INTERVAL_DEFAULT * HZ)); - return tcp_ses; out_err_crypto_release: @@ -2844,72 +2817,48 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * negprot - BB check reconnection in case where second * sessinit is sent but no second negprot */ - struct rfc1002_session_packet *ses_init_buf; - unsigned int req_noscope_len; - struct smb_hdr *smb_buf; + struct rfc1002_session_packet req = {}; + struct smb_hdr *smb_buf = (struct smb_hdr *)&req; + unsigned int len; - ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), - GFP_KERNEL); + req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name); - if (ses_init_buf) { - ses_init_buf->trailer.session_req.called_len = 32; + if (server->server_RFC1001_name[0] != 0) + rfc1002mangle(req.trailer.session_req.called_name, + server->server_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(req.trailer.session_req.called_name, + DEFAULT_CIFS_CALLED_NAME, + RFC1001_NAME_LEN_WITH_NULL); - if (server->server_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - server->server_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - DEFAULT_CIFS_CALLED_NAME, - RFC1001_NAME_LEN_WITH_NULL); + req.trailer.session_req.calling_len = sizeof(req.trailer.session_req.calling_name); - ses_init_buf->trailer.session_req.calling_len = 32; + /* calling name ends in null (byte 16) from old smb convention */ + if (server->workstation_RFC1001_name[0] != 0) + rfc1002mangle(req.trailer.session_req.calling_name, + server->workstation_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(req.trailer.session_req.calling_name, + "LINUX_CIFS_CLNT", + RFC1001_NAME_LEN_WITH_NULL); - /* - * calling name ends in null (byte 16) from old smb - * convention. - */ - if (server->workstation_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - server->workstation_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - "LINUX_CIFS_CLNT", - RFC1001_NAME_LEN_WITH_NULL); - - ses_init_buf->trailer.session_req.scope1 = 0; - ses_init_buf->trailer.session_req.scope2 = 0; - smb_buf = (struct smb_hdr *)ses_init_buf; - - /* sizeof RFC1002_SESSION_REQUEST with no scopes */ - req_noscope_len = sizeof(struct rfc1002_session_packet) - 2; - - /* == cpu_to_be32(0x81000044) */ - smb_buf->smb_buf_length = - cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len); - rc = smb_send(server, smb_buf, 0x44); - kfree(ses_init_buf); - /* - * RFC1001 layer in at least one server - * requires very short break before negprot - * presumably because not expecting negprot - * to follow so fast. This is a simple - * solution that works without - * complicating the code and causes no - * significant slowing down on mount - * for everyone else - */ - usleep_range(1000, 2000); - } /* - * else the negprot may still work without this - * even though malloc failed + * As per rfc1002, @len must be the number of bytes that follows the + * length field of a rfc1002 session request payload. */ + len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req); + + smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len); + rc = smb_send(server, smb_buf, len); + /* + * RFC1001 layer in at least one server requires very short break before + * negprot presumably because not expecting negprot to follow so fast. + * This is a simple solution that works without complicating the code + * and causes no significant slowing down on mount for everyone else + */ + usleep_range(1000, 2000); return rc; } @@ -3760,16 +3709,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; - struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; bool is_binding = false; spin_lock(&ses->ses_lock); - if (server->dstaddr.ss_family == AF_INET6) - scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); - else - scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); - if (ses->ses_status != SES_GOOD && ses->ses_status != SES_NEW && ses->ses_status != SES_NEED_RECON) { @@ -3793,6 +3738,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, ses->ses_status = SES_IN_SETUP; spin_unlock(&ses->ses_lock); + /* update ses ip_addr only for primary chan */ + if (server == pserver) { + if (server->dstaddr.ss_family == AF_INET6) + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); + else + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); + } + if (!is_binding) { ses->capabilities = server->capabilities; if (!linuxExtEnabled) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 2b6076324ffc..30b1e1bfd204 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -304,15 +304,16 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = fid; - oparms.reconnect = false; - oparms.mode = mode; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = fid, + .mode = mode, + }; rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) { cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2870e3b6ffe8..0e602173ac76 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -38,6 +38,125 @@ #include "cached_dir.h" /* + * Remove the dirty flags from a span of pages. + */ +static void cifs_undirty_folios(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each_marked(&xas, folio, end, PAGECACHE_TAG_DIRTY) { + xas_pause(&xas); + rcu_read_unlock(); + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_unlock(folio); + rcu_read_lock(); + } + + rcu_read_unlock(); +} + +/* + * Completion of write to server. + */ +void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + folio_detach_private(folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* + * Failure of write to server. + */ +void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + folio_set_error(folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* + * Redirty pages after a temporary failure. + */ +void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + filemap_dirty_folio(folio->mapping, folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* * Mark as invalid, all open files on tree connections since they * were closed when session to server was lost. */ @@ -261,14 +380,15 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ if (f_flags & O_DIRECT) create_options |= CREATE_NO_BUFFER; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = fid, + }; rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) @@ -849,14 +969,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &cfile->fid); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = &cfile->fid; - oparms.reconnect = true; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = &cfile->fid, + .reconnect = true, + }; /* * Can not refresh inode by passing in file_info buf to be returned by @@ -2296,7 +2418,6 @@ cifs_writedata_release(struct kref *refcount) if (wdata->cfile) cifsFileInfo_put(wdata->cfile); - kvfree(wdata->pages); kfree(wdata); } @@ -2307,51 +2428,49 @@ cifs_writedata_release(struct kref *refcount) static void cifs_writev_requeue(struct cifs_writedata *wdata) { - int i, rc = 0; + int rc = 0; struct inode *inode = d_inode(wdata->cfile->dentry); struct TCP_Server_Info *server; - unsigned int rest_len; + unsigned int rest_len = wdata->bytes; + loff_t fpos = wdata->offset; server = tlink_tcon(wdata->cfile->tlink)->ses->server; - i = 0; - rest_len = wdata->bytes; do { struct cifs_writedata *wdata2; - unsigned int j, nr_pages, wsize, tailsz, cur_len; + unsigned int wsize, cur_len; wsize = server->ops->wp_retry_size(inode); if (wsize < rest_len) { - nr_pages = wsize / PAGE_SIZE; - if (!nr_pages) { + if (wsize < PAGE_SIZE) { rc = -EOPNOTSUPP; break; } - cur_len = nr_pages * PAGE_SIZE; - tailsz = PAGE_SIZE; + cur_len = min(round_down(wsize, PAGE_SIZE), rest_len); } else { - nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE); cur_len = rest_len; - tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE; } - wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + wdata2 = cifs_writedata_alloc(cifs_writev_complete); if (!wdata2) { rc = -ENOMEM; break; } - for (j = 0; j < nr_pages; j++) { - wdata2->pages[j] = wdata->pages[i + j]; - lock_page(wdata2->pages[j]); - clear_page_dirty_for_io(wdata2->pages[j]); - } - wdata2->sync_mode = wdata->sync_mode; - wdata2->nr_pages = nr_pages; - wdata2->offset = page_offset(wdata2->pages[0]); - wdata2->pagesz = PAGE_SIZE; - wdata2->tailsz = tailsz; - wdata2->bytes = cur_len; + wdata2->offset = fpos; + wdata2->bytes = cur_len; + wdata2->iter = wdata->iter; + + iov_iter_advance(&wdata2->iter, fpos - wdata->offset); + iov_iter_truncate(&wdata2->iter, wdata2->bytes); + + if (iov_iter_is_xarray(&wdata2->iter)) + /* Check for pages having been redirtied and clean + * them. We can do this by walking the xarray. If + * it's not an xarray, then it's a DIO and we shouldn't + * be mucking around with the page bits. + */ + cifs_undirty_folios(inode, fpos, cur_len); rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &wdata2->cfile); @@ -2366,33 +2485,22 @@ cifs_writev_requeue(struct cifs_writedata *wdata) cifs_writedata_release); } - for (j = 0; j < nr_pages; j++) { - unlock_page(wdata2->pages[j]); - if (rc != 0 && !is_retryable_error(rc)) { - SetPageError(wdata2->pages[j]); - end_page_writeback(wdata2->pages[j]); - put_page(wdata2->pages[j]); - } - } - kref_put(&wdata2->refcount, cifs_writedata_release); if (rc) { if (is_retryable_error(rc)) continue; - i += nr_pages; + fpos += cur_len; + rest_len -= cur_len; break; } + fpos += cur_len; rest_len -= cur_len; - i += nr_pages; - } while (i < wdata->nr_pages); + } while (rest_len > 0); - /* cleanup remaining pages from the original wdata */ - for (; i < wdata->nr_pages; i++) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - put_page(wdata->pages[i]); - } + /* Clean up remaining pages from the original wdata */ + if (iov_iter_is_xarray(&wdata->iter)) + cifs_pages_write_failed(inode, fpos, rest_len); if (rc != 0 && !is_retryable_error(rc)) mapping_set_error(inode->i_mapping, rc); @@ -2405,7 +2513,6 @@ cifs_writev_complete(struct work_struct *work) struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); struct inode *inode = d_inode(wdata->cfile->dentry); - int i = 0; if (wdata->result == 0) { spin_lock(&inode->i_lock); @@ -2416,45 +2523,24 @@ cifs_writev_complete(struct work_struct *work) } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) return cifs_writev_requeue(wdata); - for (i = 0; i < wdata->nr_pages; i++) { - struct page *page = wdata->pages[i]; + if (wdata->result == -EAGAIN) + cifs_pages_write_redirty(inode, wdata->offset, wdata->bytes); + else if (wdata->result < 0) + cifs_pages_write_failed(inode, wdata->offset, wdata->bytes); + else + cifs_pages_written_back(inode, wdata->offset, wdata->bytes); - if (wdata->result == -EAGAIN) - __set_page_dirty_nobuffers(page); - else if (wdata->result < 0) - SetPageError(page); - end_page_writeback(page); - cifs_readpage_to_fscache(inode, page); - put_page(page); - } if (wdata->result != -EAGAIN) mapping_set_error(inode->i_mapping, wdata->result); kref_put(&wdata->refcount, cifs_writedata_release); } -struct cifs_writedata * -cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) -{ - struct cifs_writedata *writedata = NULL; - struct page **pages = - kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (pages) { - writedata = cifs_writedata_direct_alloc(pages, complete); - if (!writedata) - kvfree(pages); - } - - return writedata; -} - -struct cifs_writedata * -cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) +struct cifs_writedata *cifs_writedata_alloc(work_func_t complete) { struct cifs_writedata *wdata; wdata = kzalloc(sizeof(*wdata), GFP_NOFS); if (wdata != NULL) { - wdata->pages = pages; kref_init(&wdata->refcount); INIT_LIST_HEAD(&wdata->list); init_completion(&wdata->done); @@ -2463,7 +2549,6 @@ cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) return wdata; } - static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) { struct address_space *mapping = page->mapping; @@ -2522,310 +2607,372 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return rc; } -static struct cifs_writedata * -wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, - pgoff_t end, pgoff_t *index, - unsigned int *found_pages) +/* + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + */ +static void cifs_extend_writeback(struct address_space *mapping, + long *_count, + loff_t start, + int max_pages, + size_t max_len, + unsigned int *_len) { - struct cifs_writedata *wdata; - - wdata = cifs_writedata_alloc((unsigned int)tofind, - cifs_writev_complete); - if (!wdata) - return NULL; - - *found_pages = find_get_pages_range_tag(mapping, index, end, - PAGECACHE_TAG_DIRTY, tofind, wdata->pages); - return wdata; -} + struct folio_batch batch; + struct folio *folio; + unsigned int psize, nr_pages; + size_t len = *_len; + pgoff_t index = (start + len) / PAGE_SIZE; + bool stop = true; + unsigned int i; + XA_STATE(xas, &mapping->i_pages, index); -static unsigned int -wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, - struct address_space *mapping, - struct writeback_control *wbc, - pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) -{ - unsigned int nr_pages = 0, i; - struct page *page; + folio_batch_init(&batch); - for (i = 0; i < found_pages; i++) { - page = wdata->pages[i]; - /* - * At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping + do { + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. */ + rcu_read_lock(); - if (nr_pages == 0) - lock_page(page); - else if (!trylock_page(page)) - break; - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - break; - } + xas_for_each(&xas, folio, ULONG_MAX) { + stop = true; + if (xas_retry(&xas, folio)) + continue; + if (xa_is_value(folio)) + break; + if (folio_index(folio) != index) + break; + if (!folio_try_get_rcu(folio)) { + xas_reset(&xas); + continue; + } + nr_pages = folio_nr_pages(folio); + if (nr_pages > max_pages) + break; - if (!wbc->range_cyclic && page->index > end) { - *done = true; - unlock_page(page); - break; - } + /* Has the page moved or been split? */ + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); + break; + } - if (*next && (page->index != *next)) { - /* Not next consecutive page */ - unlock_page(page); - break; - } + if (!folio_trylock(folio)) { + folio_put(folio); + break; + } + if (!folio_test_dirty(folio) || folio_test_writeback(folio)) { + folio_unlock(folio); + folio_put(folio); + break; + } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + max_pages -= nr_pages; + psize = folio_size(folio); + len += psize; + stop = false; + if (max_pages <= 0 || len >= max_len || *_count <= 0) + stop = true; - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - break; + index += nr_pages; + if (!folio_batch_add(&batch, folio)) + break; + if (stop) + break; } - /* - * This actually clears the dirty bit in the radix tree. - * See cifs_writepage() for more commentary. + if (!stop) + xas_pause(&xas); + rcu_read_unlock(); + + /* Now, if we obtained any pages, we can shift them to being + * writable and mark them for caching. */ - set_page_writeback(page); - if (page_offset(page) >= i_size_read(mapping->host)) { - *done = true; - unlock_page(page); - end_page_writeback(page); + if (!folio_batch_count(&batch)) break; - } - wdata->pages[i] = page; - *next = page->index + 1; - ++nr_pages; - } - - /* reset index to refind any pages skipped */ - if (nr_pages == 0) - *index = wdata->pages[0]->index + 1; - - /* put any pages we aren't going to use */ - for (i = nr_pages; i < found_pages; i++) { - put_page(wdata->pages[i]); - wdata->pages[i] = NULL; - } - - return nr_pages; -} - -static int -wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, - struct address_space *mapping, struct writeback_control *wbc) -{ - int rc; - - wdata->sync_mode = wbc->sync_mode; - wdata->nr_pages = nr_pages; - wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_SIZE; - wdata->tailsz = min(i_size_read(mapping->host) - - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz; - wdata->pid = wdata->cfile->pid; - - rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes); - if (rc) - return rc; - - if (wdata->cfile->invalidHandle) - rc = -EAGAIN; - else - rc = wdata->server->ops->async_writev(wdata, - cifs_writedata_release); - - return rc; -} + for (i = 0; i < folio_batch_count(&batch); i++) { + folio = batch.folios[i]; + /* The folio should be locked, dirty and not undergoing + * writeback from the loop above. + */ + if (!folio_clear_dirty_for_io(folio)) + WARN_ON(1); + if (folio_start_writeback(folio)) + WARN_ON(1); -static int -cifs_writepage_locked(struct page *page, struct writeback_control *wbc); + *_count -= folio_nr_pages(folio); + folio_unlock(folio); + } -static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret; + folio_batch_release(&batch); + cond_resched(); + } while (!stop); - ret = cifs_writepage_locked(page, wbc); - unlock_page(page); - mapping_set_error(mapping, ret); - return ret; + *_len = len; } -static int cifs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +/* + * Write back the locked page and any subsequent non-locked dirty pages. + */ +static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, + struct writeback_control *wbc, + struct folio *folio, + loff_t start, loff_t end) { struct inode *inode = mapping->host; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct TCP_Server_Info *server; - bool done = false, scanned = false, range_whole = false; - pgoff_t end, index; struct cifs_writedata *wdata; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; struct cifsFileInfo *cfile = NULL; - int rc = 0; - int saved_rc = 0; - unsigned int xid; + unsigned int xid, wsize, len; + loff_t i_size = i_size_read(inode); + size_t max_len; + long count = wbc->nr_to_write; + int rc; - /* - * If wsize is smaller than the page cache size, default to writing - * one page at a time. - */ - if (cifs_sb->ctx->wsize < PAGE_SIZE) - return write_cache_pages(mapping, wbc, cifs_write_one_page, - mapping); + /* The folio should be locked, dirty and not undergoing writeback. */ + if (folio_start_writeback(folio)) + WARN_ON(1); + + count -= folio_nr_pages(folio); + len = folio_size(folio); xid = get_xid(); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = true; - scanned = true; - } server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses); -retry: - while (!done && index <= end) { - unsigned int i, nr_pages, found_pages, wsize; - pgoff_t next = 0, tofind, saved_index = index; - struct cifs_credits credits_on_stack; - struct cifs_credits *credits = &credits_on_stack; - int get_file_rc = 0; + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); + if (rc) { + cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", rc); + goto err_xid; + } - if (cfile) - cifsFileInfo_put(cfile); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize, + &wsize, credits); + if (rc != 0) + goto err_close; - rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); + wdata = cifs_writedata_alloc(cifs_writev_complete); + if (!wdata) { + rc = -ENOMEM; + goto err_uncredit; + } - /* in case of an error store it to return later */ - if (rc) - get_file_rc = rc; + wdata->sync_mode = wbc->sync_mode; + wdata->offset = folio_pos(folio); + wdata->pid = cfile->pid; + wdata->credits = credits_on_stack; + wdata->cfile = cfile; + wdata->server = server; + cfile = NULL; + + /* Find all consecutive lockable dirty pages, stopping when we find a + * page that is not immediately lockable, is not dirty or is missing, + * or we reach the end of the range. + */ + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. + */ + max_len = wsize; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); - rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize, - &wsize, credits); - if (rc != 0) { - done = true; - break; - } + if (len < max_len) { + int max_pages = INT_MAX; - tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1; +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_pages = server->smbd_conn->max_frmr_depth; +#endif + max_pages -= folio_nr_pages(folio); - wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, - &found_pages); - if (!wdata) { - rc = -ENOMEM; - done = true; - add_credits_and_wake_if(server, credits, 0); - break; + if (max_pages > 0) + cifs_extend_writeback(mapping, &count, start, + max_pages, max_len, &len); } + len = min_t(loff_t, len, max_len); + } - if (found_pages == 0) { - kref_put(&wdata->refcount, cifs_writedata_release); - add_credits_and_wake_if(server, credits, 0); - break; - } + wdata->bytes = len; + + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + folio_unlock(folio); + + if (start < i_size) { + iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages, + start, len); - nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, - end, &index, &next, &done); + rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes); + if (rc) + goto err_wdata; - /* nothing to write? */ - if (nr_pages == 0) { + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = wdata->server->ops->async_writev(wdata, + cifs_writedata_release); + if (rc >= 0) { kref_put(&wdata->refcount, cifs_writedata_release); - add_credits_and_wake_if(server, credits, 0); - continue; + goto err_close; } + } else { + /* The dirty region was entirely beyond the EOF. */ + cifs_pages_written_back(inode, start, len); + rc = 0; + } - wdata->credits = credits_on_stack; - wdata->cfile = cfile; - wdata->server = server; - cfile = NULL; +err_wdata: + kref_put(&wdata->refcount, cifs_writedata_release); +err_uncredit: + add_credits_and_wake_if(server, credits, 0); +err_close: + if (cfile) + cifsFileInfo_put(cfile); +err_xid: + free_xid(xid); + if (rc == 0) { + wbc->nr_to_write = count; + } else if (is_retryable_error(rc)) { + cifs_pages_write_redirty(inode, start, len); + } else { + cifs_pages_write_failed(inode, start, len); + mapping_set_error(mapping, rc); + } + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); + return rc; +} - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", - get_file_rc); - if (is_retryable_error(get_file_rc)) - rc = get_file_rc; - else - rc = -EBADF; - } else - rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); +/* + * write a region of pages back to the server + */ +static int cifs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + loff_t start, loff_t end, loff_t *_next) +{ + struct folio *folio; + struct page *head_page; + ssize_t ret; + int n, skips = 0; - for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + do { + pgoff_t index = start / PAGE_SIZE; - /* send failure -- clean up the mess */ - if (rc != 0) { - add_credits_and_wake_if(server, &wdata->credits, 0); - for (i = 0; i < nr_pages; ++i) { - if (is_retryable_error(rc)) - redirty_page_for_writepage(wbc, - wdata->pages[i]); - else - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - put_page(wdata->pages[i]); + n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, 1, &head_page); + if (!n) + break; + + folio = page_folio(head_page); + start = folio_pos(folio); /* May regress with THPs */ + + /* At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) { + folio_put(folio); + return ret; + } + } else { + if (!folio_trylock(folio)) { + folio_put(folio); + return 0; } - if (!is_retryable_error(rc)) - mapping_set_error(mapping, rc); } - kref_put(&wdata->refcount, cifs_writedata_release); - if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { - index = saved_index; + if (folio_mapping(folio) != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + folio_put(folio); continue; } - /* Return immediately if we received a signal during writing */ - if (is_interrupt_error(rc)) { - done = true; - break; + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); +#ifdef CONFIG_CIFS_FSCACHE + folio_wait_fscache(folio); +#endif + } else { + start += folio_size(folio); + } + folio_put(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) + break; + skips++; + } + continue; } - if (rc != 0 && saved_rc == 0) - saved_rc = rc; + if (!folio_clear_dirty_for_io(folio)) + /* We hold the page lock - it should've been dirty. */ + WARN_ON(1); - wbc->nr_to_write -= nr_pages; - if (wbc->nr_to_write <= 0) - done = true; + ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end); + folio_put(folio); + if (ret < 0) + return ret; - index = next; - } + start += ret; + cond_resched(); + } while (wbc->nr_to_write > 0); - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = true; - index = 0; - goto retry; - } + *_next = start; + return 0; +} - if (saved_rc != 0) - rc = saved_rc; +/* + * Write some of the pending data back to the server + */ +static int cifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + loff_t start, next; + int ret; - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ - if (cfile) - cifsFileInfo_put(cfile); - free_xid(xid); - /* Indication to update ctime and mtime as close is deferred */ - set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); - return rc; + if (wbc->range_cyclic) { + start = mapping->writeback_index * PAGE_SIZE; + ret = cifs_writepages_region(mapping, wbc, start, LLONG_MAX, &next); + if (ret == 0) { + mapping->writeback_index = next / PAGE_SIZE; + if (start > 0 && wbc->nr_to_write > 0) { + ret = cifs_writepages_region(mapping, wbc, 0, + start, &next); + if (ret == 0) + mapping->writeback_index = + next / PAGE_SIZE; + } + } + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + ret = cifs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next); + if (wbc->nr_to_write > 0 && ret == 0) + mapping->writeback_index = next / PAGE_SIZE; + } else { + ret = cifs_writepages_region(mapping, wbc, + wbc->range_start, wbc->range_end, &next); + } + + return ret; } static int @@ -2877,6 +3024,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct cifsFileInfo *cfile = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct folio *folio = page_folio(page); __u32 pid; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) @@ -2887,14 +3035,14 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n", page, pos, copied); - if (PageChecked(page)) { + if (folio_test_checked(folio)) { if (copied == len) - SetPageUptodate(page); - ClearPageChecked(page); - } else if (!PageUptodate(page) && copied == PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); + folio_clear_checked(folio); + } else if (!folio_test_uptodate(folio) && copied == PAGE_SIZE) + folio_mark_uptodate(folio); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { char *page_data; unsigned offset = pos & (PAGE_SIZE - 1); unsigned int xid; @@ -3054,57 +3202,13 @@ int cifs_flush(struct file *file, fl_owner_t id) return rc; } -static int -cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) -{ - int rc = 0; - unsigned long i; - - for (i = 0; i < num_pages; i++) { - pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!pages[i]) { - /* - * save number of pages we have already allocated and - * return with ENOMEM error - */ - num_pages = i; - rc = -ENOMEM; - break; - } - } - - if (rc) { - for (i = 0; i < num_pages; i++) - put_page(pages[i]); - } - return rc; -} - -static inline -size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) -{ - size_t num_pages; - size_t clen; - - clen = min_t(const size_t, len, wsize); - num_pages = DIV_ROUND_UP(clen, PAGE_SIZE); - - if (cur_len) - *cur_len = clen; - - return num_pages; -} - static void cifs_uncached_writedata_release(struct kref *refcount) { - int i; struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release); - for (i = 0; i < wdata->nr_pages; i++) - put_page(wdata->pages[i]); cifs_writedata_release(refcount); } @@ -3131,48 +3235,6 @@ cifs_uncached_writev_complete(struct work_struct *work) } static int -wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, - size_t *len, unsigned long *num_pages) -{ - size_t save_len, copied, bytes, cur_len = *len; - unsigned long i, nr_pages = *num_pages; - - save_len = cur_len; - for (i = 0; i < nr_pages; i++) { - bytes = min_t(const size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); - cur_len -= copied; - /* - * If we didn't copy as much as we expected, then that - * may mean we trod into an unmapped area. Stop copying - * at that point. On the next pass through the big - * loop, we'll likely end up getting a zero-length - * write and bailing out of it. - */ - if (copied < bytes) - break; - } - cur_len = save_len - cur_len; - *len = cur_len; - - /* - * If we have no data to send, then that probably means that - * the copy above failed altogether. That's most likely because - * the address in the iovec was bogus. Return -EFAULT and let - * the caller free anything we allocated and bail out. - */ - if (!cur_len) - return -EFAULT; - - /* - * i + 1 now represents the number of pages we actually used in - * the copy phase above. - */ - *num_pages = i + 1; - return 0; -} - -static int cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { @@ -3242,23 +3304,57 @@ fail: return rc; } +/* + * Select span of a bvec iterator we're going to use. Limit it by both maximum + * size and maximum number of segments. + */ +static size_t cifs_limit_bvec_subset(const struct iov_iter *iter, size_t max_size, + size_t max_segs, unsigned int *_nsegs) +{ + const struct bio_vec *bvecs = iter->bvec; + unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset; + + if (WARN_ON(!iov_iter_is_bvec(iter)) || n == 0) + return 0; + + while (n && ix < nbv && skip) { + len = bvecs[ix].bv_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nbv) { + len = min3(n, bvecs[ix].bv_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + *_nsegs = nsegs; + return span; +} + static int -cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, +cifs_write_from_iter(loff_t fpos, size_t len, struct iov_iter *from, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { int rc = 0; - size_t cur_len; - unsigned long nr_pages, num_pages, i; + size_t cur_len, max_len; struct cifs_writedata *wdata; - struct iov_iter saved_from = *from; - loff_t saved_offset = offset; pid_t pid; struct TCP_Server_Info *server; - struct page **pagevec; - size_t start; - unsigned int xid; + unsigned int xid, max_segs = INT_MAX; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -3268,10 +3364,20 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); xid = get_xid(); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_segs = server->smbd_conn->max_frmr_depth; +#endif + do { - unsigned int wsize; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; + unsigned int wsize, nsegs = 0; + + if (signal_pending(current)) { + rc = -EINTR; + break; + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, false); @@ -3286,99 +3392,42 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (rc) break; - cur_len = min_t(const size_t, len, wsize); - - if (ctx->direct_io) { - ssize_t result; - - result = iov_iter_get_pages_alloc2( - from, &pagevec, cur_len, &start); - if (result < 0) { - cifs_dbg(VFS, - "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", - result, iov_iter_type(from), - from->iov_offset, from->count); - dump_stack(); - - rc = result; - add_credits_and_wake_if(server, credits, 0); - break; - } - cur_len = (size_t)result; - - nr_pages = - (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE; - - wdata = cifs_writedata_direct_alloc(pagevec, - cifs_uncached_writev_complete); - if (!wdata) { - rc = -ENOMEM; - for (i = 0; i < nr_pages; i++) - put_page(pagevec[i]); - kvfree(pagevec); - add_credits_and_wake_if(server, credits, 0); - break; - } - - - wdata->page_offset = start; - wdata->tailsz = - nr_pages > 1 ? - cur_len - (PAGE_SIZE - start) - - (nr_pages - 2) * PAGE_SIZE : - cur_len; - } else { - nr_pages = get_numpages(wsize, len, &cur_len); - wdata = cifs_writedata_alloc(nr_pages, - cifs_uncached_writev_complete); - if (!wdata) { - rc = -ENOMEM; - add_credits_and_wake_if(server, credits, 0); - break; - } - - rc = cifs_write_allocate_pages(wdata->pages, nr_pages); - if (rc) { - kvfree(wdata->pages); - kfree(wdata); - add_credits_and_wake_if(server, credits, 0); - break; - } - - num_pages = nr_pages; - rc = wdata_fill_from_iovec( - wdata, from, &cur_len, &num_pages); - if (rc) { - for (i = 0; i < nr_pages; i++) - put_page(wdata->pages[i]); - kvfree(wdata->pages); - kfree(wdata); - add_credits_and_wake_if(server, credits, 0); - break; - } + max_len = min_t(const size_t, len, wsize); + if (!max_len) { + rc = -EAGAIN; + add_credits_and_wake_if(server, credits, 0); + break; + } - /* - * Bring nr_pages down to the number of pages we - * actually used, and free any pages that we didn't use. - */ - for ( ; nr_pages > num_pages; nr_pages--) - put_page(wdata->pages[nr_pages - 1]); + cur_len = cifs_limit_bvec_subset(from, max_len, max_segs, &nsegs); + cifs_dbg(FYI, "write_from_iter len=%zx/%zx nsegs=%u/%lu/%u\n", + cur_len, max_len, nsegs, from->nr_segs, max_segs); + if (cur_len == 0) { + rc = -EIO; + add_credits_and_wake_if(server, credits, 0); + break; + } - wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); + wdata = cifs_writedata_alloc(cifs_uncached_writev_complete); + if (!wdata) { + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; } wdata->sync_mode = WB_SYNC_ALL; - wdata->nr_pages = nr_pages; - wdata->offset = (__u64)offset; - wdata->cfile = cifsFileInfo_get(open_file); - wdata->server = server; - wdata->pid = pid; - wdata->bytes = cur_len; - wdata->pagesz = PAGE_SIZE; - wdata->credits = credits_on_stack; - wdata->ctx = ctx; + wdata->offset = (__u64)fpos; + wdata->cfile = cifsFileInfo_get(open_file); + wdata->server = server; + wdata->pid = pid; + wdata->bytes = cur_len; + wdata->credits = credits_on_stack; + wdata->iter = *from; + wdata->ctx = ctx; kref_get(&ctx->refcount); + iov_iter_truncate(&wdata->iter, cur_len); + rc = adjust_credits(server, &wdata->credits, wdata->bytes); if (!rc) { @@ -3393,16 +3442,14 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, add_credits_and_wake_if(server, &wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); - if (rc == -EAGAIN) { - *from = saved_from; - iov_iter_advance(from, offset - saved_offset); + if (rc == -EAGAIN) continue; - } break; } list_add_tail(&wdata->list, wdata_list); - offset += cur_len; + iov_iter_advance(from, cur_len); + fpos += cur_len; len -= cur_len; } while (len > 0); @@ -3501,20 +3548,8 @@ static ssize_t __cifs_writev( struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct cifs_aio_ctx *ctx; - struct iov_iter saved_from = *from; - size_t len = iov_iter_count(from); int rc; - /* - * iov_iter_get_pages_alloc doesn't work with ITER_KVEC. - * In this case, fall back to non-direct write function. - * this could be improved by getting pages directly in ITER_KVEC - */ - if (direct && iov_iter_is_kvec(from)) { - cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n"); - direct = false; - } - rc = generic_write_checks(iocb, from); if (rc <= 0) return rc; @@ -3536,23 +3571,54 @@ static ssize_t __cifs_writev( ctx->iocb = iocb; ctx->pos = iocb->ki_pos; + ctx->direct_io = direct; + ctx->nr_pinned_pages = 0; - if (direct) { - ctx->direct_io = true; - ctx->iter = *from; - ctx->len = len; - } else { - rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE); - if (rc) { + if (user_backed_iter(from)) { + /* + * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as + * they contain references to the calling process's virtual + * memory layout which won't be available in an async worker + * thread. This also takes a pin on every folio involved. + */ + rc = netfs_extract_user_iter(from, iov_iter_count(from), + &ctx->iter, 0); + if (rc < 0) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; } + + ctx->nr_pinned_pages = rc; + ctx->bv = (void *)ctx->iter.bvec; + ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter); + } else if ((iov_iter_is_bvec(from) || iov_iter_is_kvec(from)) && + !is_sync_kiocb(iocb)) { + /* + * If the op is asynchronous, we need to copy the list attached + * to a BVEC/KVEC-type iterator, but we assume that the storage + * will be pinned by the caller; in any case, we may or may not + * be able to pin the pages, so we don't try. + */ + ctx->bv = (void *)dup_iter(&ctx->iter, from, GFP_KERNEL); + if (!ctx->bv) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -ENOMEM; + } + } else { + /* + * Otherwise, we just pass the iterator down as-is and rely on + * the caller to make sure the pages referred to by the + * iterator don't evaporate. + */ + ctx->iter = *from; } + ctx->len = iov_iter_count(&ctx->iter); + /* grab a lock here due to read response handlers can access ctx */ mutex_lock(&ctx->aio_mutex); - rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from, + rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &ctx->iter, cfile, cifs_sb, &ctx->list, ctx); /* @@ -3695,14 +3761,12 @@ out: return written; } -static struct cifs_readdata * -cifs_readdata_direct_alloc(struct page **pages, work_func_t complete) +static struct cifs_readdata *cifs_readdata_alloc(work_func_t complete) { struct cifs_readdata *rdata; rdata = kzalloc(sizeof(*rdata), GFP_KERNEL); - if (rdata != NULL) { - rdata->pages = pages; + if (rdata) { kref_init(&rdata->refcount); INIT_LIST_HEAD(&rdata->list); init_completion(&rdata->done); @@ -3712,27 +3776,14 @@ cifs_readdata_direct_alloc(struct page **pages, work_func_t complete) return rdata; } -static struct cifs_readdata * -cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) -{ - struct page **pages = - kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - struct cifs_readdata *ret = NULL; - - if (pages) { - ret = cifs_readdata_direct_alloc(pages, complete); - if (!ret) - kfree(pages); - } - - return ret; -} - void cifs_readdata_release(struct kref *refcount) { struct cifs_readdata *rdata = container_of(refcount, struct cifs_readdata, refcount); + + if (rdata->ctx) + kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); #ifdef CONFIG_CIFS_SMB_DIRECT if (rdata->mr) { smbd_deregister_mr(rdata->mr); @@ -3742,85 +3793,9 @@ cifs_readdata_release(struct kref *refcount) if (rdata->cfile) cifsFileInfo_put(rdata->cfile); - kvfree(rdata->pages); kfree(rdata); } -static int -cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) -{ - int rc = 0; - struct page *page; - unsigned int i; - - for (i = 0; i < nr_pages; i++) { - page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!page) { - rc = -ENOMEM; - break; - } - rdata->pages[i] = page; - } - - if (rc) { - unsigned int nr_page_failed = i; - - for (i = 0; i < nr_page_failed; i++) { - put_page(rdata->pages[i]); - rdata->pages[i] = NULL; - } - } - return rc; -} - -static void -cifs_uncached_readdata_release(struct kref *refcount) -{ - struct cifs_readdata *rdata = container_of(refcount, - struct cifs_readdata, refcount); - unsigned int i; - - kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); - for (i = 0; i < rdata->nr_pages; i++) { - put_page(rdata->pages[i]); - } - cifs_readdata_release(refcount); -} - -/** - * cifs_readdata_to_iov - copy data from pages in response to an iovec - * @rdata: the readdata response with list of pages holding data - * @iter: destination for our data - * - * This function copies data from a list of pages in a readdata response into - * an array of iovecs. It will first calculate where the data should go - * based on the info in the readdata and then copy the data into that spot. - */ -static int -cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) -{ - size_t remaining = rdata->got_bytes; - unsigned int i; - - for (i = 0; i < rdata->nr_pages; i++) { - struct page *page = rdata->pages[i]; - size_t copy = min_t(size_t, remaining, PAGE_SIZE); - size_t written; - - if (unlikely(iov_iter_is_pipe(iter))) { - void *addr = kmap_atomic(page); - - written = copy_to_iter(addr, copy, iter); - kunmap_atomic(addr); - } else - written = copy_page_to_iter(page, 0, copy, iter); - remaining -= written; - if (written < copy && iov_iter_count(iter) > 0) - break; - } - return remaining ? -EFAULT : 0; -} - static void collect_uncached_read_data(struct cifs_aio_ctx *ctx); static void @@ -3832,81 +3807,7 @@ cifs_uncached_readv_complete(struct work_struct *work) complete(&rdata->done); collect_uncached_read_data(rdata->ctx); /* the below call can possibly free the last ref to aio ctx */ - kref_put(&rdata->refcount, cifs_uncached_readdata_release); -} - -static int -uncached_fill_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, struct iov_iter *iter, - unsigned int len) -{ - int result = 0; - unsigned int i; - unsigned int nr_pages = rdata->nr_pages; - unsigned int page_offset = rdata->page_offset; - - rdata->got_bytes = 0; - rdata->tailsz = PAGE_SIZE; - for (i = 0; i < nr_pages; i++) { - struct page *page = rdata->pages[i]; - size_t n; - unsigned int segment_size = rdata->pagesz; - - if (i == 0) - segment_size -= page_offset; - else - page_offset = 0; - - - if (len <= 0) { - /* no need to hold page hostage */ - rdata->pages[i] = NULL; - rdata->nr_pages--; - put_page(page); - continue; - } - - n = len; - if (len >= segment_size) - /* enough data to fill the page */ - n = segment_size; - else - rdata->tailsz = len; - len -= n; - - if (iter) - result = copy_page_from_iter( - page, page_offset, n, iter); -#ifdef CONFIG_CIFS_SMB_DIRECT - else if (rdata->mr) - result = n; -#endif - else - result = cifs_read_page_from_socket( - server, page, page_offset, n); - if (result < 0) - break; - - rdata->got_bytes += result; - } - - return result != -ECONNABORTED && rdata->got_bytes > 0 ? - rdata->got_bytes : result; -} - -static int -cifs_uncached_read_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, unsigned int len) -{ - return uncached_fill_pages(server, rdata, NULL, len); -} - -static int -cifs_uncached_copy_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter) -{ - return uncached_fill_pages(server, rdata, iter, iter->count); + kref_put(&rdata->refcount, cifs_readdata_release); } static int cifs_resend_rdata(struct cifs_readdata *rdata, @@ -3977,37 +3878,36 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, } while (rc == -EAGAIN); fail: - kref_put(&rdata->refcount, cifs_uncached_readdata_release); + kref_put(&rdata->refcount, cifs_readdata_release); return rc; } static int -cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, +cifs_send_async_read(loff_t fpos, size_t len, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct list_head *rdata_list, struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata; - unsigned int npages, rsize; + unsigned int rsize, nsegs, max_segs = INT_MAX; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; - size_t cur_len; + size_t cur_len, max_len; int rc; pid_t pid; struct TCP_Server_Info *server; - struct page **pagevec; - size_t start; - struct iov_iter direct_iov = ctx->iter; server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_segs = server->smbd_conn->max_frmr_depth; +#endif + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - if (ctx->direct_io) - iov_iter_advance(&direct_iov, offset - ctx->pos); - do { if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); @@ -4027,78 +3927,37 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (rc) break; - cur_len = min_t(const size_t, len, rsize); - - if (ctx->direct_io) { - ssize_t result; - - result = iov_iter_get_pages_alloc2( - &direct_iov, &pagevec, - cur_len, &start); - if (result < 0) { - cifs_dbg(VFS, - "Couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", - result, iov_iter_type(&direct_iov), - direct_iov.iov_offset, - direct_iov.count); - dump_stack(); - - rc = result; - add_credits_and_wake_if(server, credits, 0); - break; - } - cur_len = (size_t)result; - - rdata = cifs_readdata_direct_alloc( - pagevec, cifs_uncached_readv_complete); - if (!rdata) { - add_credits_and_wake_if(server, credits, 0); - rc = -ENOMEM; - break; - } - - npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE; - rdata->page_offset = start; - rdata->tailsz = npages > 1 ? - cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE : - cur_len; - - } else { + max_len = min_t(size_t, len, rsize); - npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); - /* allocate a readdata struct */ - rdata = cifs_readdata_alloc(npages, - cifs_uncached_readv_complete); - if (!rdata) { - add_credits_and_wake_if(server, credits, 0); - rc = -ENOMEM; - break; - } - - rc = cifs_read_allocate_pages(rdata, npages); - if (rc) { - kvfree(rdata->pages); - kfree(rdata); - add_credits_and_wake_if(server, credits, 0); - break; - } + cur_len = cifs_limit_bvec_subset(&ctx->iter, max_len, + max_segs, &nsegs); + cifs_dbg(FYI, "read-to-iter len=%zx/%zx nsegs=%u/%lu/%u\n", + cur_len, max_len, nsegs, ctx->iter.nr_segs, max_segs); + if (cur_len == 0) { + rc = -EIO; + add_credits_and_wake_if(server, credits, 0); + break; + } - rdata->tailsz = PAGE_SIZE; + rdata = cifs_readdata_alloc(cifs_uncached_readv_complete); + if (!rdata) { + add_credits_and_wake_if(server, credits, 0); + rc = -ENOMEM; + break; } - rdata->server = server; - rdata->cfile = cifsFileInfo_get(open_file); - rdata->nr_pages = npages; - rdata->offset = offset; - rdata->bytes = cur_len; - rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->read_into_pages = cifs_uncached_read_into_pages; - rdata->copy_into_pages = cifs_uncached_copy_into_pages; - rdata->credits = credits_on_stack; - rdata->ctx = ctx; + rdata->server = server; + rdata->cfile = cifsFileInfo_get(open_file); + rdata->offset = fpos; + rdata->bytes = cur_len; + rdata->pid = pid; + rdata->credits = credits_on_stack; + rdata->ctx = ctx; kref_get(&ctx->refcount); + rdata->iter = ctx->iter; + iov_iter_truncate(&rdata->iter, cur_len); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (!rc) { @@ -4110,17 +3969,15 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (rc) { add_credits_and_wake_if(server, &rdata->credits, 0); - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); - if (rc == -EAGAIN) { - iov_iter_revert(&direct_iov, cur_len); + kref_put(&rdata->refcount, cifs_readdata_release); + if (rc == -EAGAIN) continue; - } break; } list_add_tail(&rdata->list, rdata_list); - offset += cur_len; + iov_iter_advance(&ctx->iter, cur_len); + fpos += cur_len; len -= cur_len; } while (len > 0); @@ -4162,22 +4019,6 @@ again: list_del_init(&rdata->list); INIT_LIST_HEAD(&tmp_list); - /* - * Got a part of data and then reconnect has - * happened -- fill the buffer and continue - * reading. - */ - if (got_bytes && got_bytes < rdata->bytes) { - rc = 0; - if (!ctx->direct_io) - rc = cifs_readdata_to_iov(rdata, to); - if (rc) { - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); - continue; - } - } - if (ctx->direct_io) { /* * Re-use rdata as this is a @@ -4194,7 +4035,7 @@ again: &tmp_list, ctx); kref_put(&rdata->refcount, - cifs_uncached_readdata_release); + cifs_readdata_release); } list_splice(&tmp_list, &ctx->list); @@ -4202,8 +4043,6 @@ again: goto again; } else if (rdata->result) rc = rdata->result; - else if (!ctx->direct_io) - rc = cifs_readdata_to_iov(rdata, to); /* if there was a short read -- discard anything left */ if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) @@ -4212,7 +4051,7 @@ again: ctx->total_len += rdata->got_bytes; } list_del_init(&rdata->list); - kref_put(&rdata->refcount, cifs_uncached_readdata_release); + kref_put(&rdata->refcount, cifs_readdata_release); } if (!ctx->direct_io) @@ -4244,16 +4083,6 @@ static ssize_t __cifs_readv( loff_t offset = iocb->ki_pos; struct cifs_aio_ctx *ctx; - /* - * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC, - * fall back to data copy read path - * this could be improved by getting pages directly in ITER_KVEC - */ - if (direct && iov_iter_is_kvec(to)) { - cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n"); - direct = false; - } - len = iov_iter_count(to); if (!len) return 0; @@ -4272,26 +4101,53 @@ static ssize_t __cifs_readv( if (!ctx) return -ENOMEM; - ctx->cfile = cifsFileInfo_get(cfile); + ctx->pos = offset; + ctx->direct_io = direct; + ctx->len = len; + ctx->cfile = cifsFileInfo_get(cfile); + ctx->nr_pinned_pages = 0; if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (user_backed_iter(to)) - ctx->should_dirty = true; - - if (direct) { - ctx->pos = offset; - ctx->direct_io = true; - ctx->iter = *to; - ctx->len = len; - } else { - rc = setup_aio_ctx_iter(ctx, to, ITER_DEST); - if (rc) { + if (user_backed_iter(to)) { + /* + * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as + * they contain references to the calling process's virtual + * memory layout which won't be available in an async worker + * thread. This also takes a pin on every folio involved. + */ + rc = netfs_extract_user_iter(to, iov_iter_count(to), + &ctx->iter, 0); + if (rc < 0) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; } - len = ctx->len; + + ctx->nr_pinned_pages = rc; + ctx->bv = (void *)ctx->iter.bvec; + ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter); + ctx->should_dirty = true; + } else if ((iov_iter_is_bvec(to) || iov_iter_is_kvec(to)) && + !is_sync_kiocb(iocb)) { + /* + * If the op is asynchronous, we need to copy the list attached + * to a BVEC/KVEC-type iterator, but we assume that the storage + * will be retained by the caller; in any case, we may or may + * not be able to pin the pages, so we don't try. + */ + ctx->bv = (void *)dup_iter(&ctx->iter, to, GFP_KERNEL); + if (!ctx->bv) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -ENOMEM; + } + } else { + /* + * Otherwise, we just pass the iterator down as-is and rely on + * the caller to make sure the pages referred to by the + * iterator don't evaporate. + */ + ctx->iter = *to; } if (direct) { @@ -4490,23 +4346,22 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * If the page is mmap'ed into a process' page tables, then we need to make * sure that it doesn't change while being written back. */ -static vm_fault_t -cifs_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. + /* Wait for the folio to be written to the cache before we allow it to + * be modified. We then assume the entire folio will need writing back. */ #ifdef CONFIG_CIFS_FSCACHE - if (PageFsCache(page) && - wait_on_page_fscache_killable(page) < 0) + if (folio_test_fscache(folio) && + folio_wait_fscache_killable(folio) < 0) return VM_FAULT_RETRY; #endif - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (lock_page_killable(page) < 0) + if (folio_lock_killable(folio) < 0) return VM_FAULT_RETRY; return VM_FAULT_LOCKED; } @@ -4554,149 +4409,72 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } -static void -cifs_readv_complete(struct work_struct *work) +/* + * Unlock a bunch of folios in the pagecache. + */ +static void cifs_unlock_folios(struct address_space *mapping, pgoff_t first, pgoff_t last) { - unsigned int i, got_bytes; - struct cifs_readdata *rdata = container_of(work, - struct cifs_readdata, work); + struct folio *folio; + XA_STATE(xas, &mapping->i_pages, first); - got_bytes = rdata->got_bytes; - for (i = 0; i < rdata->nr_pages; i++) { - struct page *page = rdata->pages[i]; - - if (rdata->result == 0 || - (rdata->result == -EAGAIN && got_bytes)) { - flush_dcache_page(page); - SetPageUptodate(page); - } else - SetPageError(page); - - if (rdata->result == 0 || - (rdata->result == -EAGAIN && got_bytes)) - cifs_readpage_to_fscache(rdata->mapping->host, page); - - unlock_page(page); - - got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); - - put_page(page); - rdata->pages[i] = NULL; + rcu_read_lock(); + xas_for_each(&xas, folio, last) { + folio_unlock(folio); } - kref_put(&rdata->refcount, cifs_readdata_release); + rcu_read_unlock(); } -static int -readpages_fill_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, struct iov_iter *iter, - unsigned int len) +static void cifs_readahead_complete(struct work_struct *work) { - int result = 0; - unsigned int i; - u64 eof; - pgoff_t eof_index; - unsigned int nr_pages = rdata->nr_pages; - unsigned int page_offset = rdata->page_offset; - - /* determine the eof that the server (probably) has */ - eof = CIFS_I(rdata->mapping->host)->server_eof; - eof_index = eof ? (eof - 1) >> PAGE_SHIFT : 0; - cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); - - rdata->got_bytes = 0; - rdata->tailsz = PAGE_SIZE; - for (i = 0; i < nr_pages; i++) { - struct page *page = rdata->pages[i]; - unsigned int to_read = rdata->pagesz; - size_t n; - - if (i == 0) - to_read -= page_offset; - else - page_offset = 0; - - n = to_read; - - if (len >= to_read) { - len -= to_read; - } else if (len > 0) { - /* enough for partial page, fill and zero the rest */ - zero_user(page, len + page_offset, to_read - len); - n = rdata->tailsz = len; - len = 0; - } else if (page->index > eof_index) { - /* - * The VFS will not try to do readahead past the - * i_size, but it's possible that we have outstanding - * writes with gaps in the middle and the i_size hasn't - * caught up yet. Populate those with zeroed out pages - * to prevent the VFS from repeatedly attempting to - * fill them until the writes are flushed. - */ - zero_user(page, 0, PAGE_SIZE); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - put_page(page); - rdata->pages[i] = NULL; - rdata->nr_pages--; - continue; - } else { - /* no need to hold page hostage */ - unlock_page(page); - put_page(page); - rdata->pages[i] = NULL; - rdata->nr_pages--; - continue; - } + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + struct folio *folio; + pgoff_t last; + bool good = rdata->result == 0 || (rdata->result == -EAGAIN && rdata->got_bytes); - if (iter) - result = copy_page_from_iter( - page, page_offset, n, iter); -#ifdef CONFIG_CIFS_SMB_DIRECT - else if (rdata->mr) - result = n; -#endif - else - result = cifs_read_page_from_socket( - server, page, page_offset, n); - if (result < 0) - break; + XA_STATE(xas, &rdata->mapping->i_pages, rdata->offset / PAGE_SIZE); - rdata->got_bytes += result; - } + if (good) + cifs_readahead_to_fscache(rdata->mapping->host, + rdata->offset, rdata->bytes); - return result != -ECONNABORTED && rdata->got_bytes > 0 ? - rdata->got_bytes : result; -} + if (iov_iter_count(&rdata->iter) > 0) + iov_iter_zero(iov_iter_count(&rdata->iter), &rdata->iter); -static int -cifs_readpages_read_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, unsigned int len) -{ - return readpages_fill_pages(server, rdata, NULL, len); -} + last = (rdata->offset + rdata->bytes - 1) / PAGE_SIZE; -static int -cifs_readpages_copy_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter) -{ - return readpages_fill_pages(server, rdata, iter, iter->count); + rcu_read_lock(); + xas_for_each(&xas, folio, last) { + if (good) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + folio_unlock(folio); + } + rcu_read_unlock(); + + kref_put(&rdata->refcount, cifs_readdata_release); } static void cifs_readahead(struct readahead_control *ractl) { - int rc; struct cifsFileInfo *open_file = ractl->file->private_data; struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file); struct TCP_Server_Info *server; - pid_t pid; - unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0; - pgoff_t next_cached = ULONG_MAX; + unsigned int xid, nr_pages, cache_nr_pages = 0; + unsigned int ra_pages; + pgoff_t next_cached = ULONG_MAX, ra_index; bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) && cifs_inode_cookie(ractl->mapping->host)->cache_priv; bool check_cache = caching; + pid_t pid; + int rc = 0; + + /* Note that readahead_count() lags behind our dequeuing of pages from + * the ractl, wo we have to keep track for ourselves. + */ + ra_pages = readahead_count(ractl); + ra_index = readahead_index(ractl); xid = get_xid(); @@ -4705,22 +4483,21 @@ static void cifs_readahead(struct readahead_control *ractl) else pid = current->tgid; - rc = 0; server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", - __func__, ractl->file, ractl->mapping, readahead_count(ractl)); + __func__, ractl->file, ractl->mapping, ra_pages); /* * Chop the readahead request up into rsize-sized read requests. */ - while ((nr_pages = readahead_count(ractl) - last_batch_size)) { - unsigned int i, got, rsize; - struct page *page; + while ((nr_pages = ra_pages)) { + unsigned int i, rsize; struct cifs_readdata *rdata; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; - pgoff_t index = readahead_index(ractl) + last_batch_size; + struct folio *folio; + pgoff_t fsize; /* * Find out if we have anything cached in the range of @@ -4729,21 +4506,22 @@ static void cifs_readahead(struct readahead_control *ractl) if (caching) { if (check_cache) { rc = cifs_fscache_query_occupancy( - ractl->mapping->host, index, nr_pages, + ractl->mapping->host, ra_index, nr_pages, &next_cached, &cache_nr_pages); if (rc < 0) caching = false; check_cache = false; } - if (index == next_cached) { + if (ra_index == next_cached) { /* * TODO: Send a whole batch of pages to be read * by the cache. */ - struct folio *folio = readahead_folio(ractl); - - last_batch_size = folio_nr_pages(folio); + folio = readahead_folio(ractl); + fsize = folio_nr_pages(folio); + ra_pages -= fsize; + ra_index += fsize; if (cifs_readpage_from_fscache(ractl->mapping->host, &folio->page) < 0) { /* @@ -4754,8 +4532,8 @@ static void cifs_readahead(struct readahead_control *ractl) caching = false; } folio_unlock(folio); - next_cached++; - cache_nr_pages--; + next_cached += fsize; + cache_nr_pages -= fsize; if (cache_nr_pages == 0) check_cache = true; continue; @@ -4780,8 +4558,9 @@ static void cifs_readahead(struct readahead_control *ractl) &rsize, credits); if (rc) break; - nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl)); - nr_pages = min_t(size_t, nr_pages, next_cached - index); + nr_pages = min_t(size_t, rsize / PAGE_SIZE, ra_pages); + if (next_cached != ULONG_MAX) + nr_pages = min_t(size_t, nr_pages, next_cached - ra_index); /* * Give up immediately if rsize is too small to read an entire @@ -4794,33 +4573,31 @@ static void cifs_readahead(struct readahead_control *ractl) break; } - rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); + rdata = cifs_readdata_alloc(cifs_readahead_complete); if (!rdata) { /* best to give up if we're out of mem */ add_credits_and_wake_if(server, credits, 0); break; } - got = __readahead_batch(ractl, rdata->pages, nr_pages); - if (got != nr_pages) { - pr_warn("__readahead_batch() returned %u/%u\n", - got, nr_pages); - nr_pages = got; - } - - rdata->nr_pages = nr_pages; - rdata->bytes = readahead_batch_length(ractl); + rdata->offset = ra_index * PAGE_SIZE; + rdata->bytes = nr_pages * PAGE_SIZE; rdata->cfile = cifsFileInfo_get(open_file); rdata->server = server; rdata->mapping = ractl->mapping; - rdata->offset = readahead_pos(ractl); rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->tailsz = PAGE_SIZE; - rdata->read_into_pages = cifs_readpages_read_into_pages; - rdata->copy_into_pages = cifs_readpages_copy_into_pages; rdata->credits = credits_on_stack; + for (i = 0; i < nr_pages; i++) { + if (!readahead_folio(ractl)) + WARN_ON(1); + } + ra_pages -= nr_pages; + ra_index += nr_pages; + + iov_iter_xarray(&rdata->iter, ITER_DEST, &rdata->mapping->i_pages, + rdata->offset, rdata->bytes); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (!rc) { if (rdata->cfile->invalidHandle) @@ -4831,18 +4608,15 @@ static void cifs_readahead(struct readahead_control *ractl) if (rc) { add_credits_and_wake_if(server, &rdata->credits, 0); - for (i = 0; i < rdata->nr_pages; i++) { - page = rdata->pages[i]; - unlock_page(page); - put_page(page); - } + cifs_unlock_folios(rdata->mapping, + rdata->offset / PAGE_SIZE, + (rdata->offset + rdata->bytes - 1) / PAGE_SIZE); /* Fallback to the readpage in error/reconnect cases */ kref_put(&rdata->refcount, cifs_readdata_release); break; } kref_put(&rdata->refcount, cifs_readdata_release); - last_batch_size = nr_pages; } free_xid(xid); @@ -4884,10 +4658,6 @@ static int cifs_readpage_worker(struct file *file, struct page *page, flush_dcache_page(page); SetPageUptodate(page); - - /* send this page to the cache */ - cifs_readpage_to_fscache(file_inode(file), page); - rc = 0; io_error: @@ -5274,3 +5044,19 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .launder_folio = cifs_launder_folio, .migrate_folio = filemap_migrate_folio, }; + +/* + * Splice data from a file into a pipe. + */ +ssize_t cifs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + if (unlikely(*ppos >= file_inode(in)->i_sb->s_maxbytes)) + return 0; + if (unlikely(!len)) + return 0; + if (in->f_flags & O_DIRECT) + return direct_splice_read(in, ppos, pipe, len, flags); + return filemap_splice_read(in, ppos, pipe, len, flags); +} diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 0911327ebfde..8f6909d633da 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -163,20 +163,16 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) /* * Fallback page writing interface. */ -static int fscache_fallback_write_page(struct inode *inode, struct page *page, - bool no_space_allocated_yet) +static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_t len, + bool no_space_allocated_yet) { struct netfs_cache_resources cres; struct fscache_cookie *cookie = cifs_inode_cookie(inode); struct iov_iter iter; - struct bio_vec bvec; - loff_t start = page_offset(page); - size_t len = PAGE_SIZE; int ret; memset(&cres, 0, sizeof(cres)); - bvec_set_page(&bvec, page, PAGE_SIZE, 0); - iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); + iov_iter_xarray(&iter, ITER_SOURCE, &inode->i_mapping->i_pages, start, len); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) @@ -185,7 +181,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), no_space_allocated_yet); if (ret == 0) - ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); + ret = fscache_write(&cres, start, &iter, NULL, NULL); fscache_end_operation(&cres); return ret; } @@ -209,12 +205,12 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) return 0; } -void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +void __cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) { - cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", - __func__, cifs_inode_cookie(inode), page, inode); + cifs_dbg(FYI, "%s: (fsc: %p, p: %llx, l: %zx, i: %p)\n", + __func__, cifs_inode_cookie(inode), pos, len, inode); - fscache_fallback_write_page(inode, page, true); + fscache_fallback_write_pages(inode, pos, len, true); } /* diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 67b601041f0a..173999610997 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -90,7 +90,7 @@ static inline int cifs_fscache_query_occupancy(struct inode *inode, } extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage); -extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage); +extern void __cifs_readahead_to_fscache(struct inode *pinode, loff_t pos, size_t len); static inline int cifs_readpage_from_fscache(struct inode *inode, @@ -101,11 +101,11 @@ static inline int cifs_readpage_from_fscache(struct inode *inode, return -ENOBUFS; } -static inline void cifs_readpage_to_fscache(struct inode *inode, - struct page *page) +static inline void cifs_readahead_to_fscache(struct inode *inode, + loff_t pos, size_t len) { if (cifs_inode_cookie(inode)) - __cifs_readpage_to_fscache(inode, page); + __cifs_readahead_to_fscache(inode, pos, len); } #else /* CONFIG_CIFS_FSCACHE */ @@ -141,7 +141,7 @@ cifs_readpage_from_fscache(struct inode *inode, struct page *page) } static inline -void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} +void cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) {} #endif /* CONFIG_CIFS_FSCACHE */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 11cdc7cfe0ba..1087ac6104a9 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -508,14 +508,15 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -1518,14 +1519,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out; } - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = DELETE | FILE_WRITE_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) @@ -2112,15 +2114,16 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - /* open the file to be renamed -- we need DELETE perms */ - oparms.desired_access = DELETE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = from_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + /* open the file to be renamed -- we need DELETE perms */ + .desired_access = DELETE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = from_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc == 0) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 4510dea77be3..7d97c10f2453 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -271,14 +271,15 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, int buf_type = CIFS_NO_BUFFER; FILE_ALL_INFO file_info; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, &file_info); if (rc) @@ -313,14 +314,15 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_CREATE; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_CREATE, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -355,13 +357,14 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct smb2_file_all_info *pfile_info = NULL; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .fid = &fid, + }; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (utf16_path == NULL) @@ -421,14 +424,15 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_CREATE; - oparms.fid = &fid; - oparms.reconnect = false; - oparms.mode = 0644; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_CREATE, + .fid = &fid, + .mode = 0644, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 95cc4d7dd806..2905734eb289 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -966,16 +966,22 @@ cifs_aio_ctx_release(struct kref *refcount) /* * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly - * which means that iov_iter_get_pages() was a success and thus that - * we have taken reference on pages. + * which means that iov_iter_extract_pages() was a success and thus + * that we may have references or pins on pages that we need to + * release. */ if (ctx->bv) { - unsigned i; + if (ctx->should_dirty || ctx->bv_need_unpin) { + unsigned int i; - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - put_page(ctx->bv[i].bv_page); + for (i = 0; i < ctx->nr_pinned_pages; i++) { + struct page *page = ctx->bv[i].bv_page; + + if (ctx->should_dirty) + set_page_dirty(page); + if (ctx->bv_need_unpin) + unpin_user_page(page); + } } kvfree(ctx->bv); } @@ -983,94 +989,6 @@ cifs_aio_ctx_release(struct kref *refcount) kfree(ctx); } -#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024) - -int -setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) -{ - ssize_t rc; - unsigned int cur_npages; - unsigned int npages = 0; - unsigned int i; - size_t len; - size_t count = iov_iter_count(iter); - unsigned int saved_len; - size_t start; - unsigned int max_pages = iov_iter_npages(iter, INT_MAX); - struct page **pages = NULL; - struct bio_vec *bv = NULL; - - if (iov_iter_is_kvec(iter)) { - memcpy(&ctx->iter, iter, sizeof(*iter)); - ctx->len = count; - iov_iter_advance(iter, count); - return 0; - } - - if (array_size(max_pages, sizeof(*bv)) <= CIFS_AIO_KMALLOC_LIMIT) - bv = kmalloc_array(max_pages, sizeof(*bv), GFP_KERNEL); - - if (!bv) { - bv = vmalloc(array_size(max_pages, sizeof(*bv))); - if (!bv) - return -ENOMEM; - } - - if (array_size(max_pages, sizeof(*pages)) <= CIFS_AIO_KMALLOC_LIMIT) - pages = kmalloc_array(max_pages, sizeof(*pages), GFP_KERNEL); - - if (!pages) { - pages = vmalloc(array_size(max_pages, sizeof(*pages))); - if (!pages) { - kvfree(bv); - return -ENOMEM; - } - } - - saved_len = count; - - while (count && npages < max_pages) { - rc = iov_iter_get_pages2(iter, pages, count, max_pages, &start); - if (rc < 0) { - cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc); - break; - } - - if (rc > count) { - cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc, - count); - break; - } - - count -= rc; - rc += start; - cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE); - - if (npages + cur_npages > max_pages) { - cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n", - npages + cur_npages, max_pages); - break; - } - - for (i = 0; i < cur_npages; i++) { - len = rc > PAGE_SIZE ? PAGE_SIZE : rc; - bvec_set_page(&bv[npages + i], pages[i], len - start, - start); - rc -= len; - start = 0; - } - - npages += cur_npages; - } - - kvfree(pages); - ctx->bv = bv; - ctx->len = saved_len - count; - ctx->npages = npages; - iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len); - return 0; -} - /** * cifs_alloc_hash - allocate hash and hash context together * @name: The name of the crypto hash algo @@ -1128,25 +1046,6 @@ cifs_free_hash(struct shash_desc **sdesc) *sdesc = NULL; } -/** - * rqst_page_get_length - obtain the length and offset for a page in smb_rqst - * @rqst: The request descriptor - * @page: The index of the page to query - * @len: Where to store the length for this page: - * @offset: Where to store the offset for this page - */ -void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset) -{ - *len = rqst->rq_pagesz; - *offset = (page == 0) ? rqst->rq_offset : 0; - - if (rqst->rq_npages == 1 || page == rqst->rq_npages-1) - *len = rqst->rq_tailsz; - else if (page == 0) - *len = rqst->rq_pagesz - rqst->rq_offset; -} - void extract_unc_hostname(const char *unc, const char **h, size_t *len) { const char *end; diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 55758b9ec877..2c5dde2ece58 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -83,7 +83,7 @@ typedef struct _NEGOTIATE_MESSAGE { SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ /* SECURITY_BUFFER for version info not present since we do not set the version is present flag */ - char DomainString[0]; + char DomainString[]; /* followed by WorkstationString */ } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; @@ -135,7 +135,7 @@ typedef struct _AUTHENTICATE_MESSAGE { __le32 NegotiateFlags; /* SECURITY_BUFFER for version info not present since we do not set the version is present flag */ - char UserString[0]; + char UserString[]; } __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; /* diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 2d75ba5aaa8a..ef638086d734 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -495,7 +495,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) FIND_FILE_STANDARD_INFO *pfData; pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo; - new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + + new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + 1 + pfData->FileNameLength; } else { u32 next_offset = le32_to_cpu(pDirInfo->NextEntryOffset); @@ -513,9 +513,9 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) new_entry, end_of_smb, old_entry); return NULL; } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && - (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) + (new_entry + sizeof(FIND_FILE_STANDARD_INFO) + 1 > end_of_smb)) || ((level != SMB_FIND_FILE_INFO_STANDARD) && - (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { + (new_entry + sizeof(FILE_DIRECTORY_INFO) + 1 > end_of_smb))) { cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n", new_entry, end_of_smb); return NULL; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c47b254f0d1e..d2cbae4b5d21 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -480,7 +480,6 @@ out: * remove this channel */ cancel_delayed_work_sync(&chan->server->echo); - cancel_delayed_work_sync(&chan->server->resolve); cancel_delayed_work_sync(&chan->server->reconnect); spin_lock(&ses->chan_lock); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 4cb364454e13..abda6148be10 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -576,14 +576,15 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) return 0; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = FILE_READ_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; /* Need to check if this is a symbolic link or not */ tmprc = CIFS_open(xid, &oparms, &oplock, NULL); @@ -823,14 +824,15 @@ smb_set_file_info(struct inode *inode, const char *full_path, goto out; } - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); rc = CIFS_open(xid, &oparms, &oplock, NULL); @@ -998,15 +1000,16 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto out; } - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, - OPEN_REPARSE_POINT); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = FILE_READ_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, + OPEN_REPARSE_POINT), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -1115,15 +1118,16 @@ cifs_make_node(unsigned int xid, struct inode *inode, cifs_dbg(FYI, "sfu compat create special file\n"); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL); - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL), + .disposition = FILE_CREATE, + .path = full_path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 9f1dd04b555a..e0ee96d69d49 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -35,7 +35,7 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp, ErrorContextData) + sizeof(struct smb2_symlink_err_rsp)); - if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err)) + if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err) + 1) return ERR_PTR(-EINVAL); p = (struct smb2_error_context_rsp *)err->ErrorData; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 8521adf9ce79..37b4cd59245d 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -105,14 +105,15 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } - vars->oparms.tcon = tcon; - vars->oparms.desired_access = desired_access; - vars->oparms.disposition = create_disposition; - vars->oparms.create_options = cifs_create_options(cifs_sb, create_options); - vars->oparms.fid = &fid; - vars->oparms.reconnect = false; - vars->oparms.mode = mode; - vars->oparms.cifs_sb = cifs_sb; + vars->oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = desired_access, + .disposition = create_disposition, + .create_options = cifs_create_options(cifs_sb, create_options), + .fid = &fid, + .mode = mode, + .cifs_sb = cifs_sb, + }; rqst[num_rqst].rq_iov = &vars->open_iov[0]; rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 572293c18e16..3935a60db5c3 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -113,7 +113,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, } else if (nc_offset + 1 == non_ctxlen) { cifs_dbg(FYI, "no SPNEGO security blob in negprot rsp\n"); size_of_pad_before_neg_ctxts = 0; - } else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE) + } else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE + 1) /* has padding, but no SPNEGO blob */ size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen + 1; else diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cb2deac6b2d7..f79b075f2992 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -729,12 +729,13 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct cached_fid *cfid = NULL; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = open_cached_dir(xid, tcon, "", cifs_sb, false, &cfid); if (rc == 0) @@ -771,12 +772,13 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); @@ -816,12 +818,13 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, &err_iov, &err_buftype); @@ -1097,13 +1100,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_WRITE_EA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_WRITE_EA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -1453,12 +1456,12 @@ smb2_ioctl_query_info(const unsigned int xid, rqst[0].rq_iov = &vars->open_iov[0]; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, create_options), + .fid = &fid, + }; if (qi.flags & PASSTHRU_FSCTL) { switch (qi.info_type & FSCTL_DEVICE_ACCESS_MASK) { @@ -2088,12 +2091,13 @@ smb3_notify(const unsigned int xid, struct file *pfile, } tcon = cifs_sb_master_tcon(cifs_sb); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); @@ -2159,12 +2163,13 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -2490,12 +2495,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.desired_access = desired_access; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = desired_access, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -2623,12 +2629,13 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon, if (!tcon->posix_extensions) return smb2_queryfs(xid, tcon, cifs_sb, buf); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); @@ -2916,13 +2923,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, create_options), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -3056,13 +3063,13 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -3196,17 +3203,20 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, return ERR_PTR(rc); } - oparms.tcon = tcon; - oparms.desired_access = READ_CONTROL; - oparms.disposition = FILE_OPEN; - /* - * When querying an ACL, even if the file is a symlink we want to open - * the source not the target, and so the protocol requires that the - * client specify this flag when opening a reparse point - */ - oparms.create_options = cifs_create_options(cifs_sb, 0) | OPEN_REPARSE_POINT; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = READ_CONTROL, + .disposition = FILE_OPEN, + /* + * When querying an ACL, even if the file is a symlink + * we want to open the source not the target, and so + * the protocol requires that the client specify this + * flag when opening a reparse point + */ + .create_options = cifs_create_options(cifs_sb, 0) | + OPEN_REPARSE_POINT, + .fid = &fid, + }; if (info & SACL_SECINFO) oparms.desired_access |= SYSTEM_SECURITY; @@ -3265,13 +3275,14 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, return rc; } - oparms.tcon = tcon; - oparms.desired_access = access_flags; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = access_flags, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); @@ -4227,8 +4238,8 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl, - unsigned int *num_sgs) + struct aead_request **req, struct sg_table *sgt, + unsigned int *num_sgs, size_t *sensitive_size) { unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); unsigned int iv_size = crypto_aead_ivsize(tfm); @@ -4236,70 +4247,75 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst u8 *p; *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig); + if (IS_ERR_VALUE((long)(int)*num_sgs)) + return ERR_PTR(*num_sgs); len = iv_size; len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); - len += *num_sgs * sizeof(**sgl); + len += array_size(*num_sgs, sizeof(struct scatterlist)); + *sensitive_size = len; - p = kmalloc(len, GFP_ATOMIC); + p = kvzalloc(len, GFP_NOFS); if (!p) - return NULL; + return ERR_PTR(-ENOMEM); *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1); *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, crypto_tfm_ctx_alignment()); - *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, - __alignof__(struct scatterlist)); + sgt->sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); return p; } -static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst, +static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl) + struct aead_request **req, struct scatterlist **sgl, + size_t *sensitive_size) { - unsigned int off, len, skip; - struct scatterlist *sg; - unsigned int num_sgs; - unsigned long addr; - int i, j; + struct sg_table sgtable = {}; + unsigned int skip, num_sgs, i, j; + ssize_t rc; void *p; - p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs); - if (!p) - return NULL; + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, + &num_sgs, sensitive_size); + if (IS_ERR(p)) + return ERR_CAST(p); - sg_init_table(*sgl, num_sgs); - sg = *sgl; + sg_init_marker(sgtable.sgl, num_sgs); - /* Assumes the first rqst has a transform header as the first iov. - * I.e. - * rqst[0].rq_iov[0] is transform header - * rqst[0].rq_iov[1+] data to be encrypted/decrypted - * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. */ + skip = 20; + for (i = 0; i < num_rqst; i++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob. - */ + struct iov_iter *iter = &rqst[i].rq_iter; + size_t count = iov_iter_count(iter); + for (j = 0; j < rqst[i].rq_nvec; j++) { - struct kvec *iov = &rqst[i].rq_iov[j]; + cifs_sg_set_buf(&sgtable, + rqst[i].rq_iov[j].iov_base + skip, + rqst[i].rq_iov[j].iov_len - skip); - skip = (i == 0) && (j == 0) ? 20 : 0; - addr = (unsigned long)iov->iov_base + skip; - len = iov->iov_len - skip; - sg = cifs_sg_set_buf(sg, (void *)addr, len); - } - for (j = 0; j < rqst[i].rq_npages; j++) { - rqst_page_get_length(&rqst[i], j, &len, &off); - sg_set_page(sg++, rqst[i].rq_pages[j], len, off); + /* See the above comment on the 'skip' assignment */ + skip = 0; } + sgtable.orig_nents = sgtable.nents; + + rc = netfs_extract_iter_to_sg(iter, count, &sgtable, + num_sgs - sgtable.nents, 0); + iov_iter_revert(iter, rc); + sgtable.orig_nents = sgtable.nents; } - cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE); + cifs_sg_set_buf(&sgtable, sig, SMB2_SIGNATURE_SIZE); + sg_mark_end(&sgtable.sgl[sgtable.nents - 1]); + *sgl = sgtable.sgl; return p; } @@ -4353,6 +4369,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; + size_t sensitive_size; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4386,9 +4403,10 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); - if (unlikely(!creq)) - return -ENOMEM; + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg, + &sensitive_size); + if (IS_ERR(creq)) + return PTR_ERR(creq); if (!enc) { memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); @@ -4416,22 +4434,35 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree_sensitive(creq); + kvfree_sensitive(creq, sensitive_size); return rc; } +/* + * Clear a read buffer, discarding the folios which have XA_MARK_0 set. + */ +static void cifs_clear_xarray_buffer(struct xarray *buffer) +{ + struct folio *folio; + + XA_STATE(xas, buffer, 0); + + rcu_read_lock(); + xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) { + folio_put(folio); + } + rcu_read_unlock(); + xa_destroy(buffer); +} + void smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst) { - int i, j; + int i; - for (i = 0; i < num_rqst; i++) { - if (rqst[i].rq_pages) { - for (j = rqst[i].rq_npages - 1; j >= 0; j--) - put_page(rqst[i].rq_pages[j]); - kfree(rqst[i].rq_pages); - } - } + for (i = 0; i < num_rqst; i++) + if (!xa_empty(&rqst[i].rq_buffer)) + cifs_clear_xarray_buffer(&rqst[i].rq_buffer); } /* @@ -4451,9 +4482,8 @@ static int smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *new_rq, struct smb_rqst *old_rq) { - struct page **pages; struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base; - unsigned int npages; + struct page *page; unsigned int orig_len = 0; int i, j; int rc = -ENOMEM; @@ -4461,40 +4491,45 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, for (i = 1; i < num_rqst; i++) { struct smb_rqst *old = &old_rq[i - 1]; struct smb_rqst *new = &new_rq[i]; + struct xarray *buffer = &new->rq_buffer; + size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0; orig_len += smb_rqst_len(server, old); new->rq_iov = old->rq_iov; new->rq_nvec = old->rq_nvec; - npages = old->rq_npages; - if (!npages) - continue; - - pages = kmalloc_array(npages, sizeof(struct page *), - GFP_KERNEL); - if (!pages) - goto err_free; - - new->rq_pages = pages; - new->rq_npages = npages; - new->rq_offset = old->rq_offset; - new->rq_pagesz = old->rq_pagesz; - new->rq_tailsz = old->rq_tailsz; - - for (j = 0; j < npages; j++) { - pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!pages[j]) - goto err_free; - } + xa_init(buffer); + + if (size > 0) { + unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE); + + for (j = 0; j < npages; j++) { + void *o; - /* copy pages form the old */ - for (j = 0; j < npages; j++) { - unsigned int offset, len; + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!page) + goto err_free; + page->index = j; + o = xa_store(buffer, j, page, GFP_KERNEL); + if (xa_is_err(o)) { + rc = xa_err(o); + put_page(page); + goto err_free; + } - rqst_page_get_length(new, j, &len, &offset); + xa_set_mark(buffer, j, XA_MARK_0); - memcpy_page(new->rq_pages[j], offset, - old->rq_pages[j], offset, len); + seg = min_t(size_t, size - copied, PAGE_SIZE); + if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) { + rc = -EFAULT; + goto err_free; + } + copied += seg; + } + iov_iter_xarray(&new->rq_iter, ITER_SOURCE, + buffer, 0, size); + new->rq_iter_size = size; } } @@ -4523,12 +4558,12 @@ smb3_is_transform_hdr(void *buf) static int decrypt_raw_data(struct TCP_Server_Info *server, char *buf, - unsigned int buf_data_size, struct page **pages, - unsigned int npages, unsigned int page_data_size, + unsigned int buf_data_size, struct iov_iter *iter, bool is_offloaded) { struct kvec iov[2]; struct smb_rqst rqst = {NULL}; + size_t iter_size = 0; int rc; iov[0].iov_base = buf; @@ -4538,10 +4573,11 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, rqst.rq_iov = iov; rqst.rq_nvec = 2; - rqst.rq_pages = pages; - rqst.rq_npages = npages; - rqst.rq_pagesz = PAGE_SIZE; - rqst.rq_tailsz = (page_data_size % PAGE_SIZE) ? : PAGE_SIZE; + if (iter) { + rqst.rq_iter = *iter; + rqst.rq_iter_size = iov_iter_count(iter); + iter_size = iov_iter_count(iter); + } rc = crypt_message(server, 1, &rqst, 0); cifs_dbg(FYI, "Decrypt message returned %d\n", rc); @@ -4552,73 +4588,37 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, memmove(buf, iov[1].iov_base, buf_data_size); if (!is_offloaded) - server->total_read = buf_data_size + page_data_size; + server->total_read = buf_data_size + iter_size; return rc; } static int -read_data_into_pages(struct TCP_Server_Info *server, struct page **pages, - unsigned int npages, unsigned int len) +cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size, + unsigned int skip, struct iov_iter *iter) { - int i; - int length; + struct page *page; + unsigned long index; - for (i = 0; i < npages; i++) { - struct page *page = pages[i]; - size_t n; + xa_for_each(pages, index, page) { + size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size); - n = len; - if (len >= PAGE_SIZE) { - /* enough data to fill the page */ - n = PAGE_SIZE; - len -= n; - } else { - zero_user(page, len, PAGE_SIZE - len); - len = 0; + n = copy_page_to_iter(page, skip, len, iter); + if (n != len) { + cifs_dbg(VFS, "%s: something went wrong\n", __func__); + return -EIO; } - length = cifs_read_page_from_socket(server, page, 0, n); - if (length < 0) - return length; - server->total_read += length; + data_size -= n; + skip = 0; } return 0; } static int -init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size, - unsigned int cur_off, struct bio_vec **page_vec) -{ - struct bio_vec *bvec; - int i; - - bvec = kcalloc(npages, sizeof(struct bio_vec), GFP_KERNEL); - if (!bvec) - return -ENOMEM; - - for (i = 0; i < npages; i++) { - bvec_set_page(&bvec[i], pages[i], - min_t(unsigned int, PAGE_SIZE, data_size), - i == 0 ? cur_off : 0); - data_size -= bvec[i].bv_len; - } - - if (data_size != 0) { - cifs_dbg(VFS, "%s: something went wrong\n", __func__); - kfree(bvec); - return -EIO; - } - - *page_vec = bvec; - return 0; -} - -static int handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, - char *buf, unsigned int buf_len, struct page **pages, - unsigned int npages, unsigned int page_data_size, - bool is_offloaded) + char *buf, unsigned int buf_len, struct xarray *pages, + unsigned int pages_len, bool is_offloaded) { unsigned int data_offset; unsigned int data_len; @@ -4627,9 +4627,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int pad_len; struct cifs_readdata *rdata = mid->callback_data; struct smb2_hdr *shdr = (struct smb2_hdr *)buf; - struct bio_vec *bvec = NULL; - struct iov_iter iter; - struct kvec iov; int length; bool use_rdma_mr = false; @@ -4718,7 +4715,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - if (data_len > page_data_size - pad_len) { + if (data_len > pages_len - pad_len) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; if (is_offloaded) @@ -4728,8 +4725,9 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - rdata->result = init_read_bvec(pages, npages, page_data_size, - cur_off, &bvec); + /* Copy the data to the output I/O iterator. */ + rdata->result = cifs_copy_pages_to_iter(pages, pages_len, + cur_off, &rdata->iter); if (rdata->result != 0) { if (is_offloaded) mid->mid_state = MID_RESPONSE_MALFORMED; @@ -4737,14 +4735,16 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, dequeue_mid(mid, rdata->result); return 0; } + rdata->got_bytes = pages_len; - iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ - WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); - iov.iov_base = buf + data_offset; - iov.iov_len = data_len; - iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len); + WARN_ONCE(pages && !xa_empty(pages), + "read data can be either in buf or in pages"); + length = copy_to_iter(buf + data_offset, data_len, &rdata->iter); + if (length < 0) + return length; + rdata->got_bytes = data_len; } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); @@ -4756,26 +4756,18 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - length = rdata->copy_into_pages(server, rdata, &iter); - - kfree(bvec); - - if (length < 0) - return length; - if (is_offloaded) mid->mid_state = MID_RESPONSE_RECEIVED; else dequeue_mid(mid, false); - return length; + return 0; } struct smb2_decrypt_work { struct work_struct decrypt; struct TCP_Server_Info *server; - struct page **ppages; + struct xarray buffer; char *buf; - unsigned int npages; unsigned int len; }; @@ -4784,11 +4776,13 @@ static void smb2_decrypt_offload(struct work_struct *work) { struct smb2_decrypt_work *dw = container_of(work, struct smb2_decrypt_work, decrypt); - int i, rc; + int rc; struct mid_q_entry *mid; + struct iov_iter iter; + iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len); rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len, true); + &iter, true); if (rc) { cifs_dbg(VFS, "error decrypting rc=%d\n", rc); goto free_pages; @@ -4802,7 +4796,7 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->decrypted = true; rc = handle_read_data(dw->server, mid, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len, + &dw->buffer, dw->len, true); if (rc >= 0) { #ifdef CONFIG_CIFS_STATS2 @@ -4835,10 +4829,7 @@ static void smb2_decrypt_offload(struct work_struct *work) } free_pages: - for (i = dw->npages-1; i >= 0; i--) - put_page(dw->ppages[i]); - - kfree(dw->ppages); + cifs_clear_xarray_buffer(&dw->buffer); cifs_small_buf_release(dw->buf); kfree(dw); } @@ -4848,47 +4839,66 @@ static int receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, int *num_mids) { + struct page *page; char *buf = server->smallbuf; struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; - unsigned int npages; - struct page **pages; - unsigned int len; + struct iov_iter iter; + unsigned int len, npages; unsigned int buflen = server->pdu_size; int rc; int i = 0; struct smb2_decrypt_work *dw; + dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); + if (!dw) + return -ENOMEM; + xa_init(&dw->buffer); + INIT_WORK(&dw->decrypt, smb2_decrypt_offload); + dw->server = server; + *num_mids = 1; len = min_t(unsigned int, buflen, server->vals->read_rsp_size + sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1; rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len); if (rc < 0) - return rc; + goto free_dw; server->total_read += rc; len = le32_to_cpu(tr_hdr->OriginalMessageSize) - server->vals->read_rsp_size; + dw->len = len; npages = DIV_ROUND_UP(len, PAGE_SIZE); - pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - rc = -ENOMEM; - goto discard_data; - } - + rc = -ENOMEM; for (; i < npages; i++) { - pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!pages[i]) { - rc = -ENOMEM; + void *old; + + page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!page) + goto discard_data; + page->index = i; + old = xa_store(&dw->buffer, i, page, GFP_KERNEL); + if (xa_is_err(old)) { + rc = xa_err(old); + put_page(page); goto discard_data; } + xa_set_mark(&dw->buffer, i, XA_MARK_0); } - /* read read data into pages */ - rc = read_data_into_pages(server, pages, npages, len); - if (rc) - goto free_pages; + iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE); + + /* Read the data into the buffer and clear excess bufferage. */ + rc = cifs_read_iter_from_socket(server, &iter, dw->len); + if (rc < 0) + goto discard_data; + + server->total_read += rc; + if (rc < npages * PAGE_SIZE) + iov_iter_zero(npages * PAGE_SIZE - rc, &iter); + iov_iter_revert(&iter, npages * PAGE_SIZE); + iov_iter_truncate(&iter, dw->len); rc = cifs_discard_remaining_data(server); if (rc) @@ -4901,39 +4911,28 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, if ((server->min_offload) && (server->in_flight > 1) && (server->pdu_size >= server->min_offload)) { - dw = kmalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); - if (dw == NULL) - goto non_offloaded_decrypt; - dw->buf = server->smallbuf; server->smallbuf = (char *)cifs_small_buf_get(); - INIT_WORK(&dw->decrypt, smb2_decrypt_offload); - - dw->npages = npages; - dw->server = server; - dw->ppages = pages; - dw->len = len; queue_work(decrypt_wq, &dw->decrypt); *num_mids = 0; /* worker thread takes care of finding mid */ return -1; } -non_offloaded_decrypt: rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, - pages, npages, len, false); + &iter, false); if (rc) goto free_pages; *mid = smb2_find_mid(server, buf); - if (*mid == NULL) + if (*mid == NULL) { cifs_dbg(FYI, "mid not found\n"); - else { + } else { cifs_dbg(FYI, "mid found\n"); (*mid)->decrypted = true; rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, - pages, npages, len, false); + &dw->buffer, dw->len, false); if (rc >= 0) { if (server->ops->is_network_name_deleted) { server->ops->is_network_name_deleted(buf, @@ -4943,9 +4942,9 @@ non_offloaded_decrypt: } free_pages: - for (i = i - 1; i >= 0; i--) - put_page(pages[i]); - kfree(pages); + cifs_clear_xarray_buffer(&dw->buffer); +free_dw: + kfree(dw); return rc; discard_data: cifs_discard_remaining_data(server); @@ -4983,7 +4982,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, server->total_read += length; buf_size = pdu_length - sizeof(struct smb2_transform_hdr); - length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0, false); + length = decrypt_raw_data(server, buf, buf_size, NULL, false); if (length) return length; @@ -5082,7 +5081,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) char *buf = server->large_buf ? server->bigbuf : server->smallbuf; return handle_read_data(server, mid, buf, server->pdu_size, - NULL, 0, 0, false); + NULL, 0, false); } static int @@ -5134,15 +5133,16 @@ smb2_make_node(unsigned int xid, struct inode *inode, cifs_dbg(FYI, "sfu compat create special file\n"); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL); - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL), + .disposition = FILE_CREATE, + .path = full_path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -5629,7 +5629,7 @@ struct smb_version_values smb20_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5651,7 +5651,7 @@ struct smb_version_values smb21_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5672,7 +5672,7 @@ struct smb_version_values smb3any_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5693,7 +5693,7 @@ struct smb_version_values smbdefault_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5714,7 +5714,7 @@ struct smb_version_values smb30_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5735,7 +5735,7 @@ struct smb_version_values smb302_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5756,7 +5756,7 @@ struct smb_version_values smb311_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2c9ffa921e6f..ca9d7110ddcb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -139,6 +139,66 @@ out: return; } +static int wait_for_server_reconnect(struct TCP_Server_Info *server, + __le16 smb2_command, bool retry) +{ + int timeout = 10; + int rc; + + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + timeout *= server->nr_targets; + spin_unlock(&server->srv_lock); + + /* + * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE + * here since they are implicitly done when session drops. + */ + switch (smb2_command) { + /* + * BB Should we keep oplock break and add flush to exceptions? + */ + case SMB2_TREE_DISCONNECT: + case SMB2_CANCEL: + case SMB2_CLOSE: + case SMB2_OPLOCK_BREAK: + return -EAGAIN; + } + + /* + * Give demultiplex thread up to 10 seconds to each target available for + * reconnect -- should be greater than cifs socket timeout which is 7 + * seconds. + * + * On "soft" mounts we wait once. Hard mounts keep retrying until + * process is killed or server comes back on-line. + */ + do { + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + timeout * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", + __func__); + return -ERESTARTSYS; + } + + /* are we still trying to reconnect? */ + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + spin_unlock(&server->srv_lock); + } while (retry); + + cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); + return -EHOSTDOWN; +} + static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server) @@ -146,7 +206,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, int rc = 0; struct nls_table *nls_codepage; struct cifs_ses *ses; - int retries; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -184,61 +243,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, (!tcon->ses->server) || !server) return -EIO; - ses = tcon->ses; - retries = server->nr_targets; - - /* - * Give demultiplex thread up to 10 seconds to each target available for - * reconnect -- should be greater than cifs socket timeout which is 7 - * seconds. - */ - while (server->tcpStatus == CifsNeedReconnect) { - /* - * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE - * here since they are implicitly done when session drops. - */ - switch (smb2_command) { - /* - * BB Should we keep oplock break and add flush to exceptions? - */ - case SMB2_TREE_DISCONNECT: - case SMB2_CANCEL: - case SMB2_CLOSE: - case SMB2_OPLOCK_BREAK: - return -EAGAIN; - } - - rc = wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), - 10 * HZ); - if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", - __func__); - return -ERESTARTSYS; - } - - /* are we still trying to reconnect? */ - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - break; - } - spin_unlock(&server->srv_lock); - - if (retries && --retries) - continue; + rc = wait_for_server_reconnect(server, smb2_command, tcon->retry); + if (rc) + return rc; - /* - * on "soft" mounts we wait once. Hard mounts keep - * retrying until process is killed or server comes - * back on-line - */ - if (!tcon->retry) { - cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); - return -EHOSTDOWN; - } - retries = server->nr_targets; - } + ses = tcon->ses; spin_lock(&ses->chan_lock); if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { @@ -1364,7 +1373,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */); + cpu_to_le16(sizeof(struct smb2_sess_setup_req)); req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); memset(&rqst, 0, sizeof(struct smb_rqst)); @@ -1858,12 +1867,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if (unc_path == NULL) return -ENOMEM; - unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp) + 1; - unc_path_len *= 2; - if (unc_path_len < 2) { + unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp); + if (unc_path_len <= 0) { kfree(unc_path); return -EINVAL; } + unc_path_len *= 2; /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ tcon->tid = 0; @@ -1883,9 +1892,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[0].iov_len = total_len - 1; /* Testing shows that buffer offset must be at location of Buffer[0] */ - req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req) - - 1 /* pad */); - req->PathLength = cpu_to_le16(unc_path_len - 2); + req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req)); + req->PathLength = cpu_to_le16(unc_path_len); iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; @@ -3764,7 +3772,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, ses->Suid, (u8)watch_tree, completion_filter); /* validate that notify information is plausible */ if ((rsp_iov.iov_base == NULL) || - (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp))) + (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp) + 1)) goto cnotify_exit; smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; @@ -3898,7 +3906,7 @@ void smb2_reconnect_server(struct work_struct *work) goto done; /* allocate a dummy tcon struct used for reconnect */ - tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + tcon = tconInfoAlloc(); if (!tcon) { resched = true; list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { @@ -3921,7 +3929,7 @@ void smb2_reconnect_server(struct work_struct *work) list_del_init(&ses->rlist); cifs_put_smb_ses(ses); } - kfree(tcon); + tconInfoFree(tcon); done: cifs_dbg(FYI, "Reconnecting tcons and channels finished\n"); @@ -4054,6 +4062,36 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, return rc; } +#ifdef CONFIG_CIFS_SMB_DIRECT +static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms) +{ + struct TCP_Server_Info *server = io_parms->server; + struct cifs_tcon *tcon = io_parms->tcon; + + /* we can only offload if we're connected */ + if (!server || !tcon) + return false; + + /* we can only offload on an rdma connection */ + if (!server->rdma || !server->smbd_conn) + return false; + + /* we don't support signed offload yet */ + if (server->sign) + return false; + + /* we don't support encrypted offload yet */ + if (smb3_encryption_required(tcon)) + return false; + + /* offload also has its overhead, so only do it if desired */ + if (io_parms->length < server->smbd_conn->rdma_readwrite_threshold) + return false; + + return true; +} +#endif /* CONFIG_CIFS_SMB_DIRECT */ + /* * To form a chain of read requests, any read requests after the first should * have the end_of_chain boolean set to true. @@ -4097,16 +4135,12 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (server->rdma && rdata && !server->sign && - rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { - + if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; - rdata->mr = smbd_register_mr( - server->smbd_conn, rdata->pages, - rdata->nr_pages, rdata->page_offset, - rdata->tailsz, true, need_invalidate); + rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->iter, + true, need_invalidate); if (!rdata->mr) return -EAGAIN; @@ -4163,15 +4197,9 @@ smb2_readv_callback(struct mid_q_entry *mid) (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], - .rq_nvec = 1, }; - - if (rdata->got_bytes) { - rqst.rq_pages = rdata->pages; - rqst.rq_offset = rdata->page_offset; - rqst.rq_npages = rdata->nr_pages; - rqst.rq_pagesz = rdata->pagesz; - rqst.rq_tailsz = rdata->tailsz; - } + .rq_nvec = 1, + .rq_iter = rdata->iter, + .rq_iter_size = iov_iter_count(&rdata->iter), }; WARN_ONCE(rdata->server != mid->server, "rdata server %p != mid server %p", @@ -4189,6 +4217,8 @@ smb2_readv_callback(struct mid_q_entry *mid) if (server->sign && !mid->decrypted) { int rc; + iov_iter_revert(&rqst.rq_iter, rdata->got_bytes); + iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes); rc = smb2_verify_signature(&rqst, server); if (rc) cifs_tcon_dbg(VFS, "SMB signature verification returned error = %d\n", @@ -4495,10 +4525,27 @@ smb2_async_writev(struct cifs_writedata *wdata, struct kvec iov[1]; struct smb_rqst rqst = { }; unsigned int total_len; + struct cifs_io_parms _io_parms; + struct cifs_io_parms *io_parms = NULL; if (!wdata->server) server = wdata->server = cifs_pick_channel(tcon->ses); + /* + * in future we may get cifs_io_parms passed in from the caller, + * but for now we construct it here... + */ + _io_parms = (struct cifs_io_parms) { + .tcon = tcon, + .server = server, + .offset = wdata->offset, + .length = wdata->bytes, + .persistent_fid = wdata->cfile->fid.persistent_fid, + .volatile_fid = wdata->cfile->fid.volatile_fid, + .pid = wdata->pid, + }; + io_parms = &_io_parms; + rc = smb2_plain_req_init(SMB2_WRITE, tcon, server, (void **) &req, &total_len); if (rc) @@ -4508,49 +4555,44 @@ smb2_async_writev(struct cifs_writedata *wdata, flags |= CIFS_TRANSFORM_REQ; shdr = (struct smb2_hdr *)req; - shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); + shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = wdata->cfile->fid.persistent_fid; - req->VolatileFileId = wdata->cfile->fid.volatile_fid; + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; - req->Channel = 0; - req->Offset = cpu_to_le64(wdata->offset); + req->Channel = SMB2_CHANNEL_NONE; + req->Offset = cpu_to_le64(io_parms->offset); req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; - trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes); + trace_smb3_write_enter(0 /* xid */, + io_parms->persistent_fid, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, + io_parms->length); + #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a server RDMA read, fill in and append * smbd_buffer_descriptor_v1 to the end of write request */ - if (server->rdma && !server->sign && wdata->bytes >= - server->smbd_conn->rdma_readwrite_threshold) { - + if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; + size_t data_size = iov_iter_count(&wdata->iter); bool need_invalidate = server->dialect == SMB30_PROT_ID; - wdata->mr = smbd_register_mr( - server->smbd_conn, wdata->pages, - wdata->nr_pages, wdata->page_offset, - wdata->tailsz, false, need_invalidate); + wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->iter, + false, need_invalidate); if (!wdata->mr) { rc = -EAGAIN; goto async_writev_out; } req->Length = 0; req->DataOffset = 0; - if (wdata->nr_pages > 1) - req->RemainingBytes = - cpu_to_le32( - (wdata->nr_pages - 1) * wdata->pagesz - - wdata->page_offset + wdata->tailsz - ); - else - req->RemainingBytes = cpu_to_le32(wdata->tailsz); + req->RemainingBytes = cpu_to_le32(data_size); req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; @@ -4569,26 +4611,21 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 1; - rqst.rq_pages = wdata->pages; - rqst.rq_offset = wdata->page_offset; - rqst.rq_npages = wdata->nr_pages; - rqst.rq_pagesz = wdata->pagesz; - rqst.rq_tailsz = wdata->tailsz; + rqst.rq_iter = wdata->iter; + rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); #ifdef CONFIG_CIFS_SMB_DIRECT - if (wdata->mr) { + if (wdata->mr) iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); - rqst.rq_npages = 0; - } #endif - cifs_dbg(FYI, "async write at %llu %u bytes\n", - wdata->offset, wdata->bytes); + cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", + io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter)); #ifdef CONFIG_CIFS_SMB_DIRECT /* For RDMA read, I/O size is in RemainingBytes not in Length */ if (!wdata->mr) - req->Length = cpu_to_le32(wdata->bytes); + req->Length = cpu_to_le32(io_parms->length); #else - req->Length = cpu_to_le32(wdata->bytes); + req->Length = cpu_to_le32(io_parms->length); #endif if (wdata->credits.value > 0) { @@ -4596,7 +4633,7 @@ smb2_async_writev(struct cifs_writedata *wdata, SMB2_MAX_BUFFER_SIZE)); shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); - rc = adjust_credits(server, &wdata->credits, wdata->bytes); + rc = adjust_credits(server, &wdata->credits, io_parms->length); if (rc) goto async_writev_out; @@ -4609,9 +4646,12 @@ smb2_async_writev(struct cifs_writedata *wdata, if (rc) { trace_smb3_write_err(0 /* no xid */, - req->PersistentFileId, - tcon->tid, tcon->ses->Suid, wdata->offset, - wdata->bytes, rc); + io_parms->persistent_fid, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, + io_parms->length, + rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); } @@ -4906,7 +4946,7 @@ int SMB2_query_directory_init(const unsigned int xid, memcpy(bufptr, &asteriks, len); req->FileNameOffset = - cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1); + cpu_to_le16(sizeof(struct smb2_query_directory_req)); req->FileNameLength = cpu_to_le16(len); /* * BB could be 30 bytes or so longer if we used SMB2 specific @@ -4951,10 +4991,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, switch (srch_inf->info_level) { case SMB_FIND_FILE_DIRECTORY_INFO: - info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; + info_buf_size = sizeof(FILE_DIRECTORY_INFO); break; case SMB_FIND_FILE_ID_FULL_DIR_INFO: - info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; + info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO); break; case SMB_FIND_FILE_POSIX_INFO: /* note that posix payload are variable size */ @@ -5102,8 +5142,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->VolatileFileId = volatile_fid; req->AdditionalInformation = cpu_to_le32(additional_info); - req->BufferOffset = - cpu_to_le16(sizeof(struct smb2_set_info_req) - 1); + req->BufferOffset = cpu_to_le16(sizeof(struct smb2_set_info_req)); req->BufferLength = cpu_to_le32(*size); memcpy(req->Buffer, *data, *size); @@ -5337,9 +5376,9 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, req->VolatileFileId = volatile_fid; /* 1 for pad */ req->InputBufferOffset = - cpu_to_le16(sizeof(struct smb2_query_info_req) - 1); + cpu_to_le16(sizeof(struct smb2_query_info_req)); req->OutputBufferLength = cpu_to_le32( - outbuf_len + sizeof(struct smb2_query_info_rsp) - 1); + outbuf_len + sizeof(struct smb2_query_info_rsp)); iov->iov_base = (char *)req; iov->iov_len = total_len; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 1237bb86e93a..2114e8a0c63a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -57,7 +57,7 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL #define SMB2_SYMLINK_STRUCT_SIZE \ - (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + (sizeof(struct smb2_err_rsp) + sizeof(struct smb2_symlink_err_rsp)) #define SYMLINK_ERROR_TAG 0x4c4d5953 @@ -371,7 +371,7 @@ struct smb2_file_id_extd_directory_info { __le32 EaSize; /* EA size */ __le32 ReparsePointTag; /* valid if FILE_ATTR_REPARSE_POINT set in FileAttributes */ __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit */ - char FileName[1]; + char FileName[]; } __packed; /* level 60 */ extern char smb2_padding[7]; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 8c816b25ce7c..55b6e319a61d 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -34,16 +34,21 @@ static int smbd_post_recv( struct smbd_response *response); static int smbd_post_send_empty(struct smbd_connection *info); -static int smbd_post_send_data( - struct smbd_connection *info, - struct kvec *iov, int n_vec, int remaining_data_length); -static int smbd_post_send_page(struct smbd_connection *info, - struct page *page, unsigned long offset, - size_t size, int remaining_data_length); static void destroy_mr_list(struct smbd_connection *info); static int allocate_mr_list(struct smbd_connection *info); +struct smb_extract_to_rdma { + struct ib_sge *sge; + unsigned int nr_sge; + unsigned int max_sge; + struct ib_device *device; + u32 local_dma_lkey; + enum dma_data_direction direction; +}; +static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, + struct smb_extract_to_rdma *rdma); + /* SMBD version number */ #define SMBD_V1 0x0100 @@ -823,16 +828,16 @@ static int smbd_post_send(struct smbd_connection *info, return rc; } -static int smbd_post_send_sgl(struct smbd_connection *info, - struct scatterlist *sgl, int data_length, int remaining_data_length) +static int smbd_post_send_iter(struct smbd_connection *info, + struct iov_iter *iter, + int *_remaining_data_length) { - int num_sgs; int i, rc; int header_length; + int data_length; struct smbd_request *request; struct smbd_data_transfer *packet; int new_credits; - struct scatterlist *sg; wait_credit: /* Wait for send credits. A SMBD packet needs one credit */ @@ -876,6 +881,30 @@ wait_send_queue: } request->info = info; + memset(request->sge, 0, sizeof(request->sge)); + + /* Fill in the data payload to find out how much data we can add */ + if (iter) { + struct smb_extract_to_rdma extract = { + .nr_sge = 1, + .max_sge = SMBDIRECT_MAX_SEND_SGE, + .sge = request->sge, + .device = info->id->device, + .local_dma_lkey = info->pd->local_dma_lkey, + .direction = DMA_TO_DEVICE, + }; + + rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length, + &extract); + if (rc < 0) + goto err_dma; + data_length = rc; + request->num_sge = extract.nr_sge; + *_remaining_data_length -= data_length; + } else { + data_length = 0; + request->num_sge = 1; + } /* Fill in the packet header */ packet = smbd_request_payload(request); @@ -897,7 +926,7 @@ wait_send_queue: else packet->data_offset = cpu_to_le32(24); packet->data_length = cpu_to_le32(data_length); - packet->remaining_data_length = cpu_to_le32(remaining_data_length); + packet->remaining_data_length = cpu_to_le32(*_remaining_data_length); packet->padding = 0; log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", @@ -913,7 +942,6 @@ wait_send_queue: if (!data_length) header_length = offsetof(struct smbd_data_transfer, padding); - request->num_sge = 1; request->sge[0].addr = ib_dma_map_single(info->id->device, (void *)packet, header_length, @@ -927,23 +955,6 @@ wait_send_queue: request->sge[0].length = header_length; request->sge[0].lkey = info->pd->local_dma_lkey; - /* Fill in the packet data payload */ - num_sgs = sgl ? sg_nents(sgl) : 0; - for_each_sg(sgl, sg, num_sgs, i) { - request->sge[i+1].addr = - ib_dma_map_page(info->id->device, sg_page(sg), - sg->offset, sg->length, DMA_TO_DEVICE); - if (ib_dma_mapping_error( - info->id->device, request->sge[i+1].addr)) { - rc = -EIO; - request->sge[i+1].addr = 0; - goto err_dma; - } - request->sge[i+1].length = sg->length; - request->sge[i+1].lkey = info->pd->local_dma_lkey; - request->num_sge++; - } - rc = smbd_post_send(info, request); if (!rc) return 0; @@ -976,61 +987,16 @@ err_wait_credit: } /* - * Send a page - * page: the page to send - * offset: offset in the page to send - * size: length in the page to send - * remaining_data_length: remaining data to send in this payload - */ -static int smbd_post_send_page(struct smbd_connection *info, struct page *page, - unsigned long offset, size_t size, int remaining_data_length) -{ - struct scatterlist sgl; - - sg_init_table(&sgl, 1); - sg_set_page(&sgl, page, size, offset); - - return smbd_post_send_sgl(info, &sgl, size, remaining_data_length); -} - -/* * Send an empty message * Empty message is used to extend credits to peer to for keep live * while there is no upper layer payload to send at the time */ static int smbd_post_send_empty(struct smbd_connection *info) { - info->count_send_empty++; - return smbd_post_send_sgl(info, NULL, 0, 0); -} - -/* - * Send a data buffer - * iov: the iov array describing the data buffers - * n_vec: number of iov array - * remaining_data_length: remaining data to send following this packet - * in segmented SMBD packet - */ -static int smbd_post_send_data( - struct smbd_connection *info, struct kvec *iov, int n_vec, - int remaining_data_length) -{ - int i; - u32 data_length = 0; - struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1]; - - if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) { - cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); - return -EINVAL; - } + int remaining_data_length = 0; - sg_init_table(sgl, n_vec); - for (i = 0; i < n_vec; i++) { - data_length += iov[i].iov_len; - sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len); - } - - return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length); + info->count_send_empty++; + return smbd_post_send_iter(info, NULL, &remaining_data_length); } /* @@ -1700,6 +1666,7 @@ static struct smbd_connection *_smbd_get_connection( allocate_mr_failed: /* At this point, need to a full transport shutdown */ + server->smbd_conn = info; smbd_destroy(server); return NULL; @@ -1985,18 +1952,10 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; - struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1]; - int nvecs; - int size; - unsigned int buflen, remaining_data_length; - unsigned int offset, remaining_vec_data_length; - int start, i, j; - int max_iov_size = - info->max_send_size - sizeof(struct smbd_data_transfer); - struct kvec *iov; - int rc; struct smb_rqst *rqst; - int rqst_idx; + struct iov_iter iter; + unsigned int remaining_data_length, klen; + int rc, i, rqst_idx; if (info->transport_status != SMBD_CONNECTED) return -EAGAIN; @@ -2023,84 +1982,36 @@ int smbd_send(struct TCP_Server_Info *server, rqst_idx = 0; do { rqst = &rqst_array[rqst_idx]; - iov = rqst->rq_iov; cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", - rqst_idx, smb_rqst_len(server, rqst)); - remaining_vec_data_length = 0; - for (i = 0; i < rqst->rq_nvec; i++) { - remaining_vec_data_length += iov[i].iov_len; - dump_smb(iov[i].iov_base, iov[i].iov_len); - } - - log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, - rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); - - start = 0; - offset = 0; - do { - buflen = 0; - i = start; - j = 0; - while (i < rqst->rq_nvec && - j < SMBDIRECT_MAX_SEND_SGE - 1 && - buflen < max_iov_size) { - - vecs[j].iov_base = iov[i].iov_base + offset; - if (buflen + iov[i].iov_len > max_iov_size) { - vecs[j].iov_len = - max_iov_size - iov[i].iov_len; - buflen = max_iov_size; - offset = vecs[j].iov_len; - } else { - vecs[j].iov_len = - iov[i].iov_len - offset; - buflen += vecs[j].iov_len; - offset = 0; - ++i; - } - ++j; - } + rqst_idx, smb_rqst_len(server, rqst)); + for (i = 0; i < rqst->rq_nvec; i++) + dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len); + + log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n", + rqst_idx, rqst->rq_nvec, remaining_data_length, + iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst)); + + /* Send the metadata pages. */ + klen = 0; + for (i = 0; i < rqst->rq_nvec; i++) + klen += rqst->rq_iov[i].iov_len; + iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); + + rc = smbd_post_send_iter(info, &iter, &remaining_data_length); + if (rc < 0) + break; - remaining_vec_data_length -= buflen; - remaining_data_length -= buflen; - log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n", - remaining_vec_data_length > 0 ? - "partial" : "complete", - rqst->rq_nvec, start, j, - remaining_data_length); - - start = i; - rc = smbd_post_send_data(info, vecs, j, remaining_data_length); - if (rc) - goto done; - } while (remaining_vec_data_length > 0); - - /* now sending pages if there are any */ - for (i = 0; i < rqst->rq_npages; i++) { - rqst_page_get_length(rqst, i, &buflen, &offset); - nvecs = (buflen + max_iov_size - 1) / max_iov_size; - log_write(INFO, "sending pages buflen=%d nvecs=%d\n", - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - size = min_t(unsigned int, max_iov_size, remaining_data_length); - remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", - i, j * max_iov_size + offset, size, - remaining_data_length); - rc = smbd_post_send_page( - info, rqst->rq_pages[i], - j*max_iov_size + offset, - size, remaining_data_length); - if (rc) - goto done; - } + if (iov_iter_count(&rqst->rq_iter) > 0) { + /* And then the data pages if there are any */ + rc = smbd_post_send_iter(info, &rqst->rq_iter, + &remaining_data_length); + if (rc < 0) + break; } + } while (++rqst_idx < num_rqst); -done: /* * As an optimization, we don't wait for individual I/O to finish * before sending the next one. @@ -2191,10 +2102,10 @@ static void destroy_mr_list(struct smbd_connection *info) cancel_work_sync(&info->mr_recovery_work); list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { if (mr->state == MR_INVALIDATED) - ib_dma_unmap_sg(info->id->device, mr->sgl, - mr->sgl_count, mr->dir); + ib_dma_unmap_sg(info->id->device, mr->sgt.sgl, + mr->sgt.nents, mr->dir); ib_dereg_mr(mr->mr); - kfree(mr->sgl); + kfree(mr->sgt.sgl); kfree(mr); } } @@ -2217,6 +2128,7 @@ static int allocate_mr_list(struct smbd_connection *info) atomic_set(&info->mr_ready_count, 0); atomic_set(&info->mr_used_count, 0); init_waitqueue_head(&info->wait_for_mr_cleanup); + INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); /* Allocate more MRs (2x) than hardware responder_resources */ for (i = 0; i < info->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); @@ -2229,11 +2141,10 @@ static int allocate_mr_list(struct smbd_connection *info) info->mr_type, info->max_frmr_depth); goto out; } - smbdirect_mr->sgl = kcalloc( - info->max_frmr_depth, - sizeof(struct scatterlist), - GFP_KERNEL); - if (!smbdirect_mr->sgl) { + smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!smbdirect_mr->sgt.sgl) { log_rdma_mr(ERR, "failed to allocate sgl\n"); ib_dereg_mr(smbdirect_mr->mr); goto out; @@ -2244,15 +2155,15 @@ static int allocate_mr_list(struct smbd_connection *info) list_add_tail(&smbdirect_mr->list, &info->mr_list); atomic_inc(&info->mr_ready_count); } - INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); return 0; out: kfree(smbdirect_mr); list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); - kfree(smbdirect_mr->sgl); + kfree(smbdirect_mr->sgt.sgl); kfree(smbdirect_mr); } return -ENOMEM; @@ -2305,26 +2216,45 @@ again: } /* + * Transcribe the pages from an iterator into an MR scatterlist. + */ +static int smbd_iter_to_mr(struct smbd_connection *info, + struct iov_iter *iter, + struct sg_table *sgt, + unsigned int max_sg) +{ + int ret; + + memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist)); + + ret = netfs_extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0); + WARN_ON(ret < 0); + if (sgt->nents > 0) + sg_mark_end(&sgt->sgl[sgt->nents - 1]); + return ret; +} + +/* * Register memory for RDMA read/write - * pages[]: the list of pages to register memory with - * num_pages: the number of pages to register - * tailsz: if non-zero, the bytes to register in the last page + * iter: the buffer to register memory with * writing: true if this is a RDMA write (SMB read), false for RDMA read * need_invalidate: true if this MR needs to be locally invalidated after I/O * return value: the MR registered, NULL if failed. */ -struct smbd_mr *smbd_register_mr( - struct smbd_connection *info, struct page *pages[], int num_pages, - int offset, int tailsz, bool writing, bool need_invalidate) +struct smbd_mr *smbd_register_mr(struct smbd_connection *info, + struct iov_iter *iter, + bool writing, bool need_invalidate) { struct smbd_mr *smbdirect_mr; - int rc, i; + int rc, num_pages; enum dma_data_direction dir; struct ib_reg_wr *reg_wr; + num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1); if (num_pages > info->max_frmr_depth) { log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", num_pages, info->max_frmr_depth); + WARN_ON_ONCE(1); return NULL; } @@ -2333,45 +2263,31 @@ struct smbd_mr *smbd_register_mr( log_rdma_mr(ERR, "get_mr returning NULL\n"); return NULL; } - smbdirect_mr->need_invalidate = need_invalidate; - smbdirect_mr->sgl_count = num_pages; - sg_init_table(smbdirect_mr->sgl, num_pages); - - log_rdma_mr(INFO, "num_pages=0x%x offset=0x%x tailsz=0x%x\n", - num_pages, offset, tailsz); - - if (num_pages == 1) { - sg_set_page(&smbdirect_mr->sgl[0], pages[0], tailsz, offset); - goto skip_multiple_pages; - } - /* We have at least two pages to register */ - sg_set_page( - &smbdirect_mr->sgl[0], pages[0], PAGE_SIZE - offset, offset); - i = 1; - while (i < num_pages - 1) { - sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0); - i++; - } - sg_set_page(&smbdirect_mr->sgl[i], pages[i], - tailsz ? tailsz : PAGE_SIZE, 0); - -skip_multiple_pages: dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; smbdirect_mr->dir = dir; - rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir); + smbdirect_mr->need_invalidate = need_invalidate; + smbdirect_mr->sgt.nents = 0; + smbdirect_mr->sgt.orig_nents = 0; + + log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", + num_pages, iov_iter_count(iter), info->max_frmr_depth); + smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth); + + rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, dir); if (!rc) { log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", num_pages, dir, rc); goto dma_map_error; } - rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages, - NULL, PAGE_SIZE); - if (rc != num_pages) { + rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, NULL, PAGE_SIZE); + if (rc != smbdirect_mr->sgt.nents) { log_rdma_mr(ERR, - "ib_map_mr_sg failed rc = %d num_pages = %x\n", - rc, num_pages); + "ib_map_mr_sg failed rc = %d nents = %x\n", + rc, smbdirect_mr->sgt.nents); goto map_mr_error; } @@ -2403,8 +2319,8 @@ skip_multiple_pages: /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ map_mr_error: - ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, smbdirect_mr->dir); + ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, smbdirect_mr->dir); dma_map_error: smbdirect_mr->state = MR_ERROR; @@ -2471,8 +2387,8 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) if (smbdirect_mr->state == MR_INVALIDATED) { ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, + info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, smbdirect_mr->dir); smbdirect_mr->state = MR_READY; if (atomic_inc_return(&info->mr_ready_count) == 1) @@ -2490,3 +2406,206 @@ done: return rc; } + +static bool smb_set_sge(struct smb_extract_to_rdma *rdma, + struct page *lowest_page, size_t off, size_t len) +{ + struct ib_sge *sge = &rdma->sge[rdma->nr_sge]; + u64 addr; + + addr = ib_dma_map_page(rdma->device, lowest_page, + off, len, rdma->direction); + if (ib_dma_mapping_error(rdma->device, addr)) + return false; + + sge->addr = addr; + sge->length = len; + sge->lkey = rdma->local_dma_lkey; + rdma->nr_sge++; + return true; +} + +/* + * Extract page fragments from a BVEC-class iterator and add them to an RDMA + * element list. The pages are not pinned. + */ +static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + const struct bio_vec *bv = iter->bvec; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + if (!smb_set_sge(rdma, bv[i].bv_page, off, len)) + return -EIO; + + ret += len; + maxsize -= len; + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + start = 0; + } + + return ret; +} + +/* + * Extract fragments from a KVEC-class iterator and add them to an RDMA list. + * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers. + * The pages are not pinned. + */ +static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + const struct kvec *kv = iter->kvec; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + struct page *page; + unsigned long kaddr; + size_t off, len, seg; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + kaddr = (unsigned long)kv[i].iov_base + start; + off = kaddr & ~PAGE_MASK; + len = min_t(size_t, maxsize, len - start); + kaddr &= PAGE_MASK; + + maxsize -= len; + do { + seg = min_t(size_t, len, PAGE_SIZE - off); + + if (is_vmalloc_or_module_addr((void *)kaddr)) + page = vmalloc_to_page((void *)kaddr); + else + page = virt_to_page(kaddr); + + if (!smb_set_sge(rdma, page, off, seg)) + return -EIO; + + ret += seg; + len -= seg; + kaddr += PAGE_SIZE; + off = 0; + } while (len > 0 && rdma->nr_sge < rdma->max_sge); + + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + start = 0; + } + + return ret; +} + +/* + * Extract folio fragments from an XARRAY-class iterator and add them to an + * RDMA list. The folios are not pinned. + */ +static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + struct xarray *xa = iter->xarray; + struct folio *folio; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t off, len; + XA_STATE(xas, xa, index); + + rcu_read_lock(); + + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + off = offset_in_folio(folio, start); + len = min_t(size_t, maxsize, folio_size(folio) - off); + + if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) { + rcu_read_unlock(); + return -EIO; + } + + maxsize -= len; + ret += len; + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + } + + rcu_read_unlock(); + return ret; +} + +/* + * Extract page fragments from up to the given amount of the source iterator + * and build up an RDMA list that refers to all of those bits. The RDMA list + * is appended to, up to the maximum number of elements set in the parameter + * block. + * + * The extracted page fragments are not pinned or ref'd in any way; if an + * IOVEC/UBUF-type iterator is to be used, it should be converted to a + * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some + * way. + */ +static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, + struct smb_extract_to_rdma *rdma) +{ + ssize_t ret; + int before = rdma->nr_sge; + + switch (iov_iter_type(iter)) { + case ITER_BVEC: + ret = smb_extract_bvec_to_rdma(iter, rdma, len); + break; + case ITER_KVEC: + ret = smb_extract_kvec_to_rdma(iter, rdma, len); + break; + case ITER_XARRAY: + ret = smb_extract_xarray_to_rdma(iter, rdma, len); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + if (ret > 0) { + iov_iter_advance(iter, ret); + } else if (ret < 0) { + while (rdma->nr_sge > before) { + struct ib_sge *sge = &rdma->sge[rdma->nr_sge--]; + + ib_dma_unmap_single(rdma->device, sge->addr, sge->length, + rdma->direction); + sge->addr = 0; + } + } + + return ret; +} diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index 207ef979cd51..83f239f376f0 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -288,8 +288,7 @@ struct smbd_mr { struct list_head list; enum mr_state state; struct ib_mr *mr; - struct scatterlist *sgl; - int sgl_count; + struct sg_table sgt; enum dma_data_direction dir; union { struct ib_reg_wr wr; @@ -302,8 +301,8 @@ struct smbd_mr { /* Interfaces to register and deregister MR for RDMA read/write */ struct smbd_mr *smbd_register_mr( - struct smbd_connection *info, struct page *pages[], int num_pages, - int offset, int tailsz, bool writing, bool need_invalidate); + struct smbd_connection *info, struct iov_iter *iter, + bool writing, bool need_invalidate); int smbd_deregister_mr(struct smbd_mr *mr); #else diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 3851d0aaa288..b42050c68e6c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -270,26 +270,7 @@ smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst) for (i = 0; i < nvec; i++) buflen += iov[i].iov_len; - /* - * Add in the page array if there is one. The caller needs to make - * sure rq_offset and rq_tailsz are set correctly. If a buffer of - * multiple pages ends at page boundary, rq_tailsz needs to be set to - * PAGE_SIZE. - */ - if (rqst->rq_npages) { - if (rqst->rq_npages == 1) - buflen += rqst->rq_tailsz; - else { - /* - * If there is more than one page, calculate the - * buffer length based on rq_offset and rq_tailsz - */ - buflen += rqst->rq_pagesz * (rqst->rq_npages - 1) - - rqst->rq_offset; - buflen += rqst->rq_tailsz; - } - } - + buflen += iov_iter_count(&rqst->rq_iter); return buflen; } @@ -376,23 +357,15 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, total_len += sent; - /* now walk the page array and send each page in it */ - for (i = 0; i < rqst[j].rq_npages; i++) { - struct bio_vec bvec; - - bvec.bv_page = rqst[j].rq_pages[i]; - rqst_page_get_length(&rqst[j], i, &bvec.bv_len, - &bvec.bv_offset); - - iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE, - &bvec, 1, bvec.bv_len); + if (iov_iter_count(&rqst[j].rq_iter) > 0) { + smb_msg.msg_iter = rqst[j].rq_iter; rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) break; - total_len += sent; } - } + +} unmask: sigprocmask(SIG_SETMASK, &oldmask, NULL); @@ -1034,15 +1007,40 @@ cifs_cancelled_callback(struct mid_q_entry *mid) struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) { uint index = 0; + unsigned int min_in_flight = UINT_MAX, max_in_flight = 0; + struct TCP_Server_Info *server = NULL; + int i; if (!ses) return NULL; - /* round robin */ - index = (uint)atomic_inc_return(&ses->chan_seq); - spin_lock(&ses->chan_lock); - index %= ses->chan_count; + for (i = 0; i < ses->chan_count; i++) { + server = ses->chans[i].server; + if (!server) + continue; + + /* + * strictly speaking, we should pick up req_lock to read + * server->in_flight. But it shouldn't matter much here if we + * race while reading this data. The worst that can happen is + * that we could use a channel that's not least loaded. Avoiding + * taking the lock could help reduce wait time, which is + * important for this function + */ + if (server->in_flight < min_in_flight) { + min_in_flight = server->in_flight; + index = i; + } + if (server->in_flight > max_in_flight) + max_in_flight = server->in_flight; + } + + /* if all channels are equally loaded, fall back to round-robin */ + if (min_in_flight == max_in_flight) { + index = (uint)atomic_inc_return(&ses->chan_seq); + index %= ses->chan_count; + } spin_unlock(&ses->chan_lock); return ses->chans[index].server; @@ -1640,11 +1638,11 @@ int cifs_discard_remaining_data(struct TCP_Server_Info *server) { unsigned int rfclen = server->pdu_size; - int remaining = rfclen + HEADER_PREAMBLE_SIZE(server) - + size_t remaining = rfclen + HEADER_PREAMBLE_SIZE(server) - server->total_read; while (remaining > 0) { - int length; + ssize_t length; length = cifs_discard_from_socket(server, min_t(size_t, remaining, @@ -1790,10 +1788,15 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return cifs_readv_discard(server, mid); } - length = rdata->read_into_pages(server, rdata, data_len); - if (length < 0) - return length; - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (rdata->mr) + length = data_len; /* An RDMA read is already done. */ + else +#endif + length = cifs_read_iter_from_socket(server, &rdata->iter, + data_len); + if (length > 0) + rdata->got_bytes += length; server->total_read += length; cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index e401302478c3..aed7704a0672 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -26,7 +26,7 @@ static struct smb_version_values smb21_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -52,7 +52,7 @@ static struct smb_version_values smb30_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -79,7 +79,7 @@ static struct smb_version_values smb302_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -106,7 +106,7 @@ static struct smb_version_values smb311_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 013fd6452942..0685c1c77b9f 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -274,8 +274,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) le16_to_cpu(rsp->SecurityBufferOffset)); inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) + - AUTH_GSS_LENGTH); + sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; @@ -1205,8 +1204,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) + - AUTH_GSS_LENGTH); + sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; conn->use_spnego = true; diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index f684c0cd1ec5..386d6fb92793 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -3,6 +3,7 @@ netfs-y := \ buffered_read.o \ io.o \ + iterator.o \ main.o \ objects.o diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c new file mode 100644 index 000000000000..f00d43b8ac0a --- /dev/null +++ b/fs/netfs/iterator.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Iterator helpers. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/scatterlist.h> +#include <linux/netfs.h> +#include "internal.h" + +/** + * netfs_extract_user_iter - Extract the pages from a user iterator into a bvec + * @orig: The original iterator + * @orig_len: The amount of iterator to copy + * @new: The iterator to be set up + * @extraction_flags: Flags to qualify the request + * + * Extract the page fragments from the given amount of the source iterator and + * build up a second iterator that refers to all of those bits. This allows + * the original iterator to disposed of. + * + * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA be + * allowed on the pages extracted. + * + * On success, the number of elements in the bvec is returned, the original + * iterator will have been advanced by the amount extracted. + * + * The iov_iter_extract_mode() function should be used to query how cleanup + * should be performed. + */ +ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, + struct iov_iter *new, + iov_iter_extraction_t extraction_flags) +{ + struct bio_vec *bv = NULL; + struct page **pages; + unsigned int cur_npages; + unsigned int max_pages; + unsigned int npages = 0; + unsigned int i; + ssize_t ret; + size_t count = orig_len, offset, len; + size_t bv_size, pg_size; + + if (WARN_ON_ONCE(!iter_is_ubuf(orig) && !iter_is_iovec(orig))) + return -EIO; + + max_pages = iov_iter_npages(orig, INT_MAX); + bv_size = array_size(max_pages, sizeof(*bv)); + bv = kvmalloc(bv_size, GFP_KERNEL); + if (!bv) + return -ENOMEM; + + /* Put the page list at the end of the bvec list storage. bvec + * elements are larger than page pointers, so as long as we work + * 0->last, we should be fine. + */ + pg_size = array_size(max_pages, sizeof(*pages)); + pages = (void *)bv + bv_size - pg_size; + + while (count && npages < max_pages) { + ret = iov_iter_extract_pages(orig, &pages, count, + max_pages - npages, extraction_flags, + &offset); + if (ret < 0) { + pr_err("Couldn't get user pages (rc=%zd)\n", ret); + break; + } + + if (ret > count) { + pr_err("get_pages rc=%zd more than %zu\n", ret, count); + break; + } + + count -= ret; + ret += offset; + cur_npages = DIV_ROUND_UP(ret, PAGE_SIZE); + + if (npages + cur_npages > max_pages) { + pr_err("Out of bvec array capacity (%u vs %u)\n", + npages + cur_npages, max_pages); + break; + } + + for (i = 0; i < cur_npages; i++) { + len = ret > PAGE_SIZE ? PAGE_SIZE : ret; + bvec_set_page(bv + npages + i, *pages++, len - offset, offset); + ret -= len; + offset = 0; + } + + npages += cur_npages; + } + + iov_iter_bvec(new, orig->data_source, bv, npages, orig_len - count); + return npages; +} +EXPORT_SYMBOL_GPL(netfs_extract_user_iter); + +/* + * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class + * iterators, and add them to the scatterlist. + */ +static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + struct page **pages; + unsigned int npages; + ssize_t ret = 0, res; + size_t len, off; + + /* We decant the page list into the tail of the scatterlist */ + pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist)); + pages -= sg_max; + + do { + res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, + extraction_flags, &off); + if (res < 0) + goto failed; + + len = res; + maxsize -= len; + ret += len; + npages = DIV_ROUND_UP(off + len, PAGE_SIZE); + sg_max -= npages; + + for (; npages < 0; npages--) { + struct page *page = *pages; + size_t seg = min_t(size_t, PAGE_SIZE - off, len); + + *pages++ = NULL; + sg_set_page(sg, page, len, off); + sgtable->nents++; + sg++; + len -= seg; + off = 0; + } + } while (maxsize > 0 && sg_max > 0); + + return ret; + +failed: + while (sgtable->nents > sgtable->orig_nents) + put_page(sg_page(&sgtable->sgl[--sgtable->nents])); + return res; +} + +/* + * Extract up to sg_max pages from a BVEC-type iterator and add them to the + * scatterlist. The pages are not pinned. + */ +static ssize_t netfs_extract_bvec_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + const struct bio_vec *bv = iter->bvec; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + sg_set_page(sg, bv[i].bv_page, len, off); + sgtable->nents++; + sg++; + sg_max--; + + ret += len; + maxsize -= len; + if (maxsize <= 0 || sg_max == 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract up to sg_max pages from a KVEC-type iterator and add them to the + * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or + * static buffers. The pages are not pinned. + */ +static ssize_t netfs_extract_kvec_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + const struct kvec *kv = iter->kvec; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + struct page *page; + unsigned long kaddr; + size_t off, len, seg; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + kaddr = (unsigned long)kv[i].iov_base + start; + off = kaddr & ~PAGE_MASK; + len = min_t(size_t, maxsize, len - start); + kaddr &= PAGE_MASK; + + maxsize -= len; + ret += len; + do { + seg = min_t(size_t, len, PAGE_SIZE - off); + if (is_vmalloc_or_module_addr((void *)kaddr)) + page = vmalloc_to_page((void *)kaddr); + else + page = virt_to_page(kaddr); + + sg_set_page(sg, page, len, off); + sgtable->nents++; + sg++; + sg_max--; + + len -= seg; + kaddr += PAGE_SIZE; + off = 0; + } while (len > 0 && sg_max > 0); + + if (maxsize <= 0 || sg_max == 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract up to sg_max folios from an XARRAY-type iterator and add them to + * the scatterlist. The pages are not pinned. + */ +static ssize_t netfs_extract_xarray_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + struct xarray *xa = iter->xarray; + struct folio *folio; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t offset, len; + XA_STATE(xas, xa, index); + + rcu_read_lock(); + + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + offset = offset_in_folio(folio, start); + len = min_t(size_t, maxsize, folio_size(folio) - offset); + + sg_set_page(sg, folio_page(folio, 0), len, offset); + sgtable->nents++; + sg++; + sg_max--; + + maxsize -= len; + ret += len; + if (maxsize <= 0 || sg_max == 0) + break; + } + + rcu_read_unlock(); + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/** + * netfs_extract_iter_to_sg - Extract pages from an iterator and add ot an sglist + * @iter: The iterator to extract from + * @maxsize: The amount of iterator to copy + * @sgtable: The scatterlist table to fill in + * @sg_max: Maximum number of elements in @sgtable that may be filled + * @extraction_flags: Flags to qualify the request + * + * Extract the page fragments from the given amount of the source iterator and + * add them to a scatterlist that refers to all of those bits, to a maximum + * addition of @sg_max elements. + * + * The pages referred to by UBUF- and IOVEC-type iterators are extracted and + * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE- + * and DISCARD-type are not supported. + * + * No end mark is placed on the scatterlist; that's left to the caller. + * + * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA + * be allowed on the pages extracted. + * + * If successul, @sgtable->nents is updated to include the number of elements + * added and the number of bytes added is returned. @sgtable->orig_nents is + * left unaltered. + * + * The iov_iter_extract_mode() function should be used to query how cleanup + * should be performed. + */ +ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, + struct sg_table *sgtable, unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + if (maxsize == 0) + return 0; + + switch (iov_iter_type(iter)) { + case ITER_UBUF: + case ITER_IOVEC: + return netfs_extract_user_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_BVEC: + return netfs_extract_bvec_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_KVEC: + return netfs_extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_XARRAY: + return netfs_extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + default: + pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter)); + WARN_ON_ONCE(1); + return -EIO; + } +} +EXPORT_SYMBOL_GPL(netfs_extract_iter_to_sg); diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 7d605db3bb3b..ace133cf6072 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -167,7 +167,7 @@ struct smb2_err_rsp { __u8 ErrorContextCount; __u8 Reserved; __le32 ByteCount; /* even if zero, at least one byte follows */ - __u8 ErrorData[1]; /* variable length */ + __u8 ErrorData[]; /* variable length */ } __packed; #define SMB3_AES_CCM_NONCE 11 @@ -308,7 +308,7 @@ struct smb2_tree_connect_req { __le16 Flags; /* Flags in SMB3.1.1 */ __le16 PathOffset; __le16 PathLength; - __u8 Buffer[1]; /* variable length */ + __u8 Buffer[]; /* variable length */ } __packed; /* Possible ShareType values */ @@ -595,7 +595,7 @@ struct smb2_negotiate_rsp { __le16 SecurityBufferOffset; __le16 SecurityBufferLength; __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; @@ -616,7 +616,7 @@ struct smb2_sess_setup_req { __le16 SecurityBufferOffset; __le16 SecurityBufferLength; __le64 PreviousSessionId; - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; /* Currently defined SessionFlags */ @@ -633,7 +633,7 @@ struct smb2_sess_setup_rsp { __le16 SessionFlags; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; @@ -715,7 +715,7 @@ struct smb2_read_req { __le32 RemainingBytes; __le16 ReadChannelInfoOffset; __le16 ReadChannelInfoLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* Read flags */ @@ -730,7 +730,7 @@ struct smb2_read_rsp { __le32 DataLength; __le32 DataRemaining; __le32 Flags; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; @@ -754,7 +754,7 @@ struct smb2_write_req { __le16 WriteChannelInfoOffset; __le16 WriteChannelInfoLength; __le32 Flags; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_write_rsp { @@ -765,7 +765,7 @@ struct smb2_write_rsp { __le32 DataLength; __le32 DataRemaining; __u32 Reserved2; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; @@ -812,7 +812,10 @@ struct smb2_lock_req { __u64 PersistentFileId; __u64 VolatileFileId; /* Followed by at least one */ - struct smb2_lock_element locks[1]; + union { + struct smb2_lock_element lock; + DECLARE_FLEX_ARRAY(struct smb2_lock_element, locks); + }; } __packed; struct smb2_lock_rsp { @@ -866,7 +869,7 @@ struct smb2_query_directory_req { __le16 FileNameOffset; __le16 FileNameLength; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_query_directory_rsp { @@ -874,7 +877,7 @@ struct smb2_query_directory_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* @@ -897,7 +900,7 @@ struct smb2_set_info_req { __le32 AdditionalInformation; __u64 PersistentFileId; __u64 VolatileFileId; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_set_info_rsp { @@ -952,7 +955,7 @@ struct smb2_change_notify_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; /* array of file notify structs */ + __u8 Buffer[]; /* array of file notify structs */ } __packed; @@ -1158,7 +1161,7 @@ struct smb2_create_rsp { __u64 VolatileFileId; __le32 CreateContextsOffset; __le32 CreateContextsLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct create_posix { @@ -1501,7 +1504,7 @@ struct smb2_query_info_req { __le32 Flags; __u64 PersistentFileId; __u64 VolatileFileId; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_query_info_rsp { @@ -1509,7 +1512,7 @@ struct smb2_query_info_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* @@ -1570,7 +1573,10 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */ __le32 Mode; __le32 AlignmentRequirement; __le32 FileNameLength; - char FileName[1]; + union { + char __pad; /* Legacy structure padding */ + DECLARE_FLEX_ARRAY(char, FileName); + }; } __packed; /* level 18 Query */ struct smb2_file_eof_info { /* encoding of request for level 10 */ diff --git a/fs/splice.c b/fs/splice.c index 87d9b19349de..2e76dbb81a8f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -282,6 +282,99 @@ void splice_shrink_spd(struct splice_pipe_desc *spd) kfree(spd->partial); } +/* + * Splice data from an O_DIRECT file into pages and then add them to the output + * pipe. + */ +ssize_t direct_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct iov_iter to; + struct bio_vec *bv; + struct kiocb kiocb; + struct page **pages; + ssize_t ret; + size_t used, npages, chunk, remain, reclaim; + int i; + + /* Work out how much data we can actually add into the pipe */ + used = pipe_occupancy(pipe->head, pipe->tail); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); + npages = DIV_ROUND_UP(len, PAGE_SIZE); + + bv = kzalloc(array_size(npages, sizeof(bv[0])) + + array_size(npages, sizeof(struct page *)), GFP_KERNEL); + if (!bv) + return -ENOMEM; + + pages = (void *)(bv + npages); + npages = alloc_pages_bulk_array(GFP_USER, npages, pages); + if (!npages) { + kfree(bv); + return -ENOMEM; + } + + remain = len = min_t(size_t, len, npages * PAGE_SIZE); + + for (i = 0; i < npages; i++) { + chunk = min_t(size_t, PAGE_SIZE, remain); + bv[i].bv_page = pages[i]; + bv[i].bv_offset = 0; + bv[i].bv_len = chunk; + remain -= chunk; + } + + /* Do the I/O */ + iov_iter_bvec(&to, ITER_DEST, bv, npages, len); + init_sync_kiocb(&kiocb, in); + kiocb.ki_pos = *ppos; + ret = call_read_iter(in, &kiocb, &to); + + reclaim = npages * PAGE_SIZE; + remain = 0; + if (ret > 0) { + reclaim -= ret; + remain = ret; + *ppos = kiocb.ki_pos; + file_accessed(in); + } else if (ret < 0) { + /* + * callers of ->splice_read() expect -EAGAIN on + * "can't put anything in there", rather than -EFAULT. + */ + if (ret == -EFAULT) + ret = -EAGAIN; + } + + /* Free any pages that didn't get touched at all. */ + reclaim /= PAGE_SIZE; + if (reclaim) { + npages -= reclaim; + release_pages(pages + npages, reclaim); + } + + /* Push the remaining pages into the pipe. */ + for (i = 0; i < npages; i++) { + struct pipe_buffer *buf = pipe_head_buf(pipe); + + chunk = min_t(size_t, remain, PAGE_SIZE); + *buf = (struct pipe_buffer) { + .ops = &default_pipe_buf_ops, + .page = bv[i].bv_page, + .offset = 0, + .len = chunk, + }; + pipe->head++; + remain -= chunk; + } + + kfree(bv); + return ret; +} +EXPORT_SYMBOL(direct_splice_read); + /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from diff --git a/include/linux/fs.h b/include/linux/fs.h index 2acc46fb5f97..d46ae1e525fc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2728,6 +2728,12 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); /* fs/splice.c */ +ssize_t filemap_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +ssize_t direct_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4c76ddfb6a67..a1f3522daa69 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -17,6 +17,7 @@ #include <linux/workqueue.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/uio.h> enum netfs_sreq_ref_trace; @@ -296,6 +297,13 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq, void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, enum netfs_sreq_ref_trace what); void netfs_stats_show(struct seq_file *); +ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, + struct iov_iter *new, + iov_iter_extraction_t extraction_flags); +struct sg_table; +ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t len, + struct sg_table *sgtable, unsigned int sg_max, + iov_iter_extraction_t extraction_flags); /** * netfs_inode - Get the netfs inode context from the inode diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 29e1f9e76eb6..2f5b36f446cc 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -978,16 +978,6 @@ static inline int folio_lock_killable(struct folio *folio) } /* - * lock_page_killable is like lock_page but can be interrupted by fatal - * signals. It returns 0 if it locked the page and -EINTR if it was - * killed while waiting. - */ -static inline int lock_page_killable(struct page *page) -{ - return folio_lock_killable(page_folio(page)); -} - -/* * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 6cb65df3e3ba..d2c3f16cf6b1 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -157,6 +157,26 @@ static inline bool pipe_full(unsigned int head, unsigned int tail, } /** + * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring + * @pipe: The pipe to access + * @slot: The slot of interest + */ +static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, + unsigned int slot) +{ + return &pipe->bufs[slot & (pipe->ring_size - 1)]; +} + +/** + * pipe_head_buf - Return the pipe buffer at the head of the pipe ring + * @pipe: The pipe to access + */ +static inline struct pipe_buffer *pipe_head_buf(const struct pipe_inode_info *pipe) +{ + return pipe_buf(pipe, pipe->head); +} + +/** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to diff --git a/include/linux/uio.h b/include/linux/uio.h index 73b1d5d1e4f1..27e3fd942960 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -13,6 +13,8 @@ struct page; struct pipe_inode_info; +typedef unsigned int __bitwise iov_iter_extraction_t; + struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; @@ -252,12 +254,12 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray * loff_t start, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start, - unsigned gup_flags); + iov_iter_extraction_t extraction_flags); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start, - unsigned gup_flags); + iov_iter_extraction_t extraction_flags); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); @@ -360,5 +362,34 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, .count = count }; } +/* Flags for iov_iter_get/extract_pages*() */ +/* Allow P2PDMA on the extracted pages */ +#define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01) + +ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, + size_t maxsize, unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0); + +/** + * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained + * @iter: The iterator + * + * Examine the iterator and indicate by returning true or false as to how, if + * at all, pages extracted from the iterator will be retained by the extraction + * function. + * + * %true indicates that the pages will have a pin placed in them that the + * caller must unpin. This is must be done for DMA/async DIO to force fork() + * to forcibly copy a page for the child (the parent must retain the original + * page). + * + * %false indicates that no measures are taken and that it's up to the caller + * to retain the pages. + */ +static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter) +{ + return user_backed_iter(iter); +} #endif diff --git a/lib/iov_iter.c b/lib/iov_iter.c index d9b3332c8405..274014e4eafe 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -186,12 +186,6 @@ static int copyin(void *to, const void __user *from, size_t n) return res; } -static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, - unsigned int slot) -{ - return &pipe->bufs[slot & (pipe->ring_size - 1)]; -} - #ifdef PIPE_PARANOIA static bool sanity(const struct iov_iter *i) { @@ -1432,9 +1426,9 @@ static struct page *first_bvec_segment(const struct iov_iter *i, static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start, - unsigned int gup_flags) + iov_iter_extraction_t extraction_flags) { - unsigned int n; + unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; @@ -1442,6 +1436,8 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; + if (extraction_flags & ITER_ALLOW_P2PDMA) + gup_flags |= FOLL_PCI_P2PDMA; if (likely(user_backed_iter(i))) { unsigned long addr; @@ -1495,14 +1491,14 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start, unsigned gup_flags) + size_t *start, iov_iter_extraction_t extraction_flags) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, - start, gup_flags); + start, extraction_flags); } EXPORT_SYMBOL_GPL(iov_iter_get_pages); @@ -1515,14 +1511,14 @@ EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, - size_t *start, unsigned gup_flags) + size_t *start, iov_iter_extraction_t extraction_flags) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start, - gup_flags); + extraction_flags); if (len <= 0) { kvfree(*pages); *pages = NULL; @@ -1925,3 +1921,267 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) i->iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } + +/* + * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not + * get references on the pages, nor does it get a pin on them. + */ +static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, + struct page ***pages, size_t maxsize, + unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0) +{ + struct page *page, **p; + unsigned int nr = 0, offset; + loff_t pos = i->xarray_start + i->iov_offset; + pgoff_t index = pos >> PAGE_SHIFT; + XA_STATE(xas, i->xarray, index); + + offset = pos & ~PAGE_MASK; + *offset0 = offset; + + maxpages = want_pages_array(pages, maxsize, offset, maxpages); + if (!maxpages) + return -ENOMEM; + p = *pages; + + rcu_read_lock(); + for (page = xas_load(&xas); page; page = xas_next(&xas)) { + if (xas_retry(&xas, page)) + continue; + + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(&xas))) { + xas_reset(&xas); + continue; + } + + p[nr++] = find_subpage(page, xas.xa_index); + if (nr == maxpages) + break; + } + rcu_read_unlock(); + + maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); + iov_iter_advance(i, maxsize); + return maxsize; +} + +/* + * Extract a list of contiguous pages from an ITER_BVEC iterator. This does + * not get references on the pages, nor does it get a pin on them. + */ +static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, + struct page ***pages, size_t maxsize, + unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0) +{ + struct page **p, *page; + size_t skip = i->iov_offset, offset; + int k; + + for (;;) { + if (i->nr_segs == 0) + return 0; + maxsize = min(maxsize, i->bvec->bv_len - skip); + if (maxsize) + break; + i->iov_offset = 0; + i->nr_segs--; + i->bvec++; + skip = 0; + } + + skip += i->bvec->bv_offset; + page = i->bvec->bv_page + skip / PAGE_SIZE; + offset = skip % PAGE_SIZE; + *offset0 = offset; + + maxpages = want_pages_array(pages, maxsize, offset, maxpages); + if (!maxpages) + return -ENOMEM; + p = *pages; + for (k = 0; k < maxpages; k++) + p[k] = page + k; + + maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset); + iov_iter_advance(i, maxsize); + return maxsize; +} + +/* + * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. + * This does not get references on the pages, nor does it get a pin on them. + */ +static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, + struct page ***pages, size_t maxsize, + unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0) +{ + struct page **p, *page; + const void *kaddr; + size_t skip = i->iov_offset, offset, len; + int k; + + for (;;) { + if (i->nr_segs == 0) + return 0; + maxsize = min(maxsize, i->kvec->iov_len - skip); + if (maxsize) + break; + i->iov_offset = 0; + i->nr_segs--; + i->kvec++; + skip = 0; + } + + kaddr = i->kvec->iov_base + skip; + offset = (unsigned long)kaddr & ~PAGE_MASK; + *offset0 = offset; + + maxpages = want_pages_array(pages, maxsize, offset, maxpages); + if (!maxpages) + return -ENOMEM; + p = *pages; + + kaddr -= offset; + len = offset + maxsize; + for (k = 0; k < maxpages; k++) { + size_t seg = min_t(size_t, len, PAGE_SIZE); + + if (is_vmalloc_or_module_addr(kaddr)) + page = vmalloc_to_page(kaddr); + else + page = virt_to_page(kaddr); + + p[k] = page; + len -= seg; + kaddr += PAGE_SIZE; + } + + maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset); + iov_iter_advance(i, maxsize); + return maxsize; +} + +/* + * Extract a list of contiguous pages from a user iterator and get a pin on + * each of them. This should only be used if the iterator is user-backed + * (IOBUF/UBUF). + * + * It does not get refs on the pages, but the pages must be unpinned by the + * caller once the transfer is complete. + * + * This is safe to be used where background IO/DMA *is* going to be modifying + * the buffer; using a pin rather than a ref makes forces fork() to give the + * child a copy of the page. + */ +static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, + struct page ***pages, + size_t maxsize, + unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0) +{ + unsigned long addr; + unsigned int gup_flags = 0; + size_t offset; + int res; + + if (i->data_source == ITER_DEST) + gup_flags |= FOLL_WRITE; + if (extraction_flags & ITER_ALLOW_P2PDMA) + gup_flags |= FOLL_PCI_P2PDMA; + if (i->nofault) + gup_flags |= FOLL_NOFAULT; + + addr = first_iovec_segment(i, &maxsize); + *offset0 = offset = addr % PAGE_SIZE; + addr &= PAGE_MASK; + maxpages = want_pages_array(pages, maxsize, offset, maxpages); + if (!maxpages) + return -ENOMEM; + res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); + if (unlikely(res <= 0)) + return res; + maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); + iov_iter_advance(i, maxsize); + return maxsize; +} + +/** + * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator + * @i: The iterator to extract from + * @pages: Where to return the list of pages + * @maxsize: The maximum amount of iterator to extract + * @maxpages: The maximum size of the list of pages + * @extraction_flags: Flags to qualify request + * @offset0: Where to return the starting offset into (*@pages)[0] + * + * Extract a list of contiguous pages from the current point of the iterator, + * advancing the iterator. The maximum number of pages and the maximum amount + * of page contents can be set. + * + * If *@pages is NULL, a page list will be allocated to the required size and + * *@pages will be set to its base. If *@pages is not NULL, it will be assumed + * that the caller allocated a page list at least @maxpages in size and this + * will be filled in. + * + * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA + * be allowed on the pages extracted. + * + * The iov_iter_extract_will_pin() function can be used to query how cleanup + * should be performed. + * + * Extra refs or pins on the pages may be obtained as follows: + * + * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be + * added to the pages, but refs will not be taken. + * iov_iter_extract_will_pin() will return true. + * + * (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are + * merely listed; no extra refs or pins are obtained. + * iov_iter_extract_will_pin() will return 0. + * + * Note also: + * + * (*) Use with ITER_DISCARD is not supported as that has no content. + * + * On success, the function sets *@pages to the new pagelist, if allocated, and + * sets *offset0 to the offset into the first page. + * + * It may also return -ENOMEM and -EFAULT. + */ +ssize_t iov_iter_extract_pages(struct iov_iter *i, + struct page ***pages, + size_t maxsize, + unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0) +{ + maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); + if (!maxsize) + return 0; + + if (likely(user_backed_iter(i))) + return iov_iter_extract_user_pages(i, pages, maxsize, + maxpages, extraction_flags, + offset0); + if (iov_iter_is_kvec(i)) + return iov_iter_extract_kvec_pages(i, pages, maxsize, + maxpages, extraction_flags, + offset0); + if (iov_iter_is_bvec(i)) + return iov_iter_extract_bvec_pages(i, pages, maxsize, + maxpages, extraction_flags, + offset0); + if (iov_iter_is_xarray(i)) + return iov_iter_extract_xarray_pages(i, pages, maxsize, + maxpages, extraction_flags, + offset0); + return -EFAULT; +} +EXPORT_SYMBOL_GPL(iov_iter_extract_pages); diff --git a/mm/filemap.c b/mm/filemap.c index 0e20a8d6dd93..b794943bce76 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,8 @@ #include <linux/ramfs.h> #include <linux/page_idle.h> #include <linux/migrate.h> +#include <linux/pipe_fs_i.h> +#include <linux/splice.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -2440,21 +2442,19 @@ static int filemap_read_folio(struct file *file, filler_t filler, } static bool filemap_range_uptodate(struct address_space *mapping, - loff_t pos, struct iov_iter *iter, struct folio *folio) + loff_t pos, size_t count, struct folio *folio, + bool need_uptodate) { - int count; - if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ - if (iov_iter_is_pipe(iter)) + if (need_uptodate) return false; if (!mapping->a_ops->is_partially_uptodate) return false; if (mapping->host->i_blkbits >= folio_shift(folio)) return false; - count = iter->count; if (folio_pos(folio) > pos) { count -= folio_pos(folio) - pos; pos = 0; @@ -2466,8 +2466,8 @@ static bool filemap_range_uptodate(struct address_space *mapping, } static int filemap_update_page(struct kiocb *iocb, - struct address_space *mapping, struct iov_iter *iter, - struct folio *folio) + struct address_space *mapping, size_t count, + struct folio *folio, bool need_uptodate) { int error; @@ -2501,7 +2501,8 @@ static int filemap_update_page(struct kiocb *iocb, goto unlock; error = 0; - if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio)) + if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, + need_uptodate)) goto unlock; error = -EAGAIN; @@ -2577,8 +2578,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, return 0; } -static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, - struct folio_batch *fbatch) +static int filemap_get_pages(struct kiocb *iocb, size_t count, + struct folio_batch *fbatch, bool need_uptodate) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; @@ -2589,7 +2590,7 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, int err = 0; /* "last_index" is the index of the page beyond the end of the read */ - last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); + last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); retry: if (fatal_signal_pending(current)) return -EINTR; @@ -2622,7 +2623,8 @@ retry: if ((iocb->ki_flags & IOCB_WAITQ) && folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; - err = filemap_update_page(iocb, mapping, iter, folio); + err = filemap_update_page(iocb, mapping, count, folio, + need_uptodate); if (err) goto err; } @@ -2692,7 +2694,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter, &fbatch); + error = filemap_get_pages(iocb, iter->count, &fbatch, + iov_iter_is_pipe(iter)); if (error < 0) break; @@ -2842,6 +2845,134 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); +/* + * Splice subpages from a folio into a pipe. + */ +size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, + struct folio *folio, loff_t fpos, size_t size) +{ + struct page *page; + size_t spliced = 0, offset = offset_in_folio(folio, fpos); + + page = folio_page(folio, offset / PAGE_SIZE); + size = min(size, folio_size(folio) - offset); + offset %= PAGE_SIZE; + + while (spliced < size && + !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + struct pipe_buffer *buf = pipe_head_buf(pipe); + size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); + + *buf = (struct pipe_buffer) { + .ops = &page_cache_pipe_buf_ops, + .page = page, + .offset = offset, + .len = part, + }; + folio_get(folio); + pipe->head++; + page++; + spliced += part; + offset = 0; + } + + return spliced; +} + +/* + * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into + * a pipe. + */ +ssize_t filemap_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct folio_batch fbatch; + struct kiocb iocb; + size_t total_spliced = 0, used, npages; + loff_t isize, end_offset; + bool writably_mapped; + int i, error = 0; + + init_sync_kiocb(&iocb, in); + iocb.ki_pos = *ppos; + + /* Work out how much data we can actually add into the pipe */ + used = pipe_occupancy(pipe->head, pipe->tail); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); + + folio_batch_init(&fbatch); + + do { + cond_resched(); + + if (*ppos >= i_size_read(file_inode(in))) + break; + + iocb.ki_pos = *ppos; + error = filemap_get_pages(&iocb, len, &fbatch, true); + if (error < 0) + break; + + /* + * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(file_inode(in)); + if (unlikely(*ppos >= isize)) + break; + end_offset = min_t(loff_t, isize, *ppos + len); + + /* + * Once we start copying data, we don't want to be touching any + * cachelines that might be contended: + */ + writably_mapped = mapping_writably_mapped(in->f_mapping); + + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + size_t n; + + if (folio_pos(folio) >= end_offset) + goto out; + folio_mark_accessed(folio); + + /* + * If users can be writing to this folio using arbitrary + * virtual addresses, take care of potential aliasing + * before reading the folio on the kernel side. + */ + if (writably_mapped) + flush_dcache_folio(folio); + + n = min_t(loff_t, len, isize - *ppos); + n = splice_folio_into_pipe(pipe, folio, *ppos, n); + if (!n) + goto out; + len -= n; + total_spliced += n; + *ppos += n; + in->f_ra.prev_pos = *ppos; + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + goto out; + } + + folio_batch_release(&fbatch); + } while (len); + +out: + folio_batch_release(&fbatch); + file_accessed(in); + + return total_spliced ? total_spliced : error; +} +EXPORT_SYMBOL(filemap_splice_read); + static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) diff --git a/mm/internal.h b/mm/internal.h index bcf75a8b032d..6d4ca98f3844 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -795,6 +795,12 @@ struct migration_target_control { }; /* + * mm/filemap.c + */ +size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, + struct folio *folio, loff_t fpos, size_t size); + +/* * mm/vmalloc.c */ #ifdef CONFIG_MMU diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ca71de7c9d77..61f5bec0f2b6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -656,6 +656,7 @@ int is_vmalloc_or_module_addr(const void *x) #endif return is_vmalloc_addr(x); } +EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); /* * Walk a vmap address to the struct page it maps. Huge vmap mappings will |