From aad11473f8f4be3df86461081ce35ec5b145ba68 Mon Sep 17 00:00:00 2001 From: Dmitry Mastykin Date: Wed, 22 May 2024 10:45:24 +0300 Subject: [PATCH 1/9] NFSv4: Fix memory leak in nfs4_set_security_label We leak nfs_fattr and nfs4_label every time we set a security xattr. Signed-off-by: Dmitry Mastykin Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c93c12063b3a..94c07875aa3f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6268,6 +6268,7 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) if (status == 0) nfs_setsecurity(inode, fattr); + nfs_free_fattr(fattr); return status; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ From 6cbe14f42be3b596e9590d48e12436982ba26e4b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 May 2024 11:27:32 -0400 Subject: [PATCH 2/9] MAINTAINERS: Change email address for Trond Myklebust Ensure that patches are sent to an email server that won't corrupt them. Signed-off-by: Trond Myklebust --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 28e20975c26f..481d76fbf16c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15549,7 +15549,7 @@ F: drivers/nfc/virtual_ncidev.c F: tools/testing/selftests/nci/ NFS, SUNRPC, AND LOCKD CLIENTS -M: Trond Myklebust +M: Trond Myklebust M: Anna Schumaker L: linux-nfs@vger.kernel.org S: Maintained From 134d0b3f2440cdddd12fc3444c9c0f62331ce6fc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 21 May 2024 15:58:40 +0300 Subject: [PATCH 3/9] nfs: propagate readlink errors in nfs_symlink_filler There is an inherent race where a symlink file may have been overriden (by a different client) between lookup and readlink, resulting in a spurious EIO error returned to userspace. Fix this by propagating back ESTALE errors such that the vfs will retry the lookup/get_link (similar to nfs4_file_open) at least once. Cc: Dan Aloni Signed-off-by: Sagi Grimberg Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/symlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 0e27a2e4e68b..13818129d268 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -41,7 +41,7 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio) error: folio_set_error(folio); folio_unlock(folio); - return -EIO; + return error; } static const char *nfs_get_link(struct dentry *dentry, From a527c3ba41c4c61e2069bfce4091e5515f06a8dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 24 May 2024 18:14:19 +0200 Subject: [PATCH 4/9] nfs: Avoid flushing many pages with NFS_FILE_SYNC When we are doing WB_SYNC_ALL writeback, nfs submits write requests with NFS_FILE_SYNC flag to the server (which then generally treats it as an O_SYNC write). This helps to reduce latency for single requests but when submitting more requests, additional fsyncs on the server side hurt latency. NFS generally avoids this additional overhead by not setting NFS_FILE_SYNC if desc->pg_moreio is set. However this logic doesn't always work. When we do random 4k writes to a huge file and then call fsync(2), each page writeback is going to be sent with NFS_FILE_SYNC because after preparing one page for writeback, we start writing back next, nfs_do_writepage() will call nfs_pageio_cond_complete() which finds the page is not contiguous with previously prepared IO and submits is *without* setting desc->pg_moreio. Hence NFS_FILE_SYNC is used resulting in poor performance. Fix the problem by setting desc->pg_moreio in nfs_pageio_cond_complete() before submitting outstanding IO. This improves throughput of fsync-after-random-writes on my test SSD from ~70MB/s to ~250MB/s. Signed-off-by: Jan Kara Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6efb5068c116..040b6b79c75e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1545,6 +1545,11 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) continue; } else if (index == prev->wb_index + 1) continue; + /* + * We will submit more requests after these. Indicate + * this to the underlying layers. + */ + desc->pg_moreio = 1; nfs_pageio_complete(desc); break; } From 0c8c7c559740d2d8b66048162af6c4dba8f0c88c Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 23 May 2024 15:01:22 -0400 Subject: [PATCH 5/9] nfs: don't invalidate dentries on transient errors This is a slight variation on a patch previously proposed by Neil Brown that never got merged. Prior to commit 5ceb9d7fdaaf ("NFS: Refactor nfs_lookup_revalidate()"), any error from nfs_lookup_verify_inode() other than -ESTALE would result in nfs_lookup_revalidate() returning that error (-ESTALE is mapped to zero). Since that commit, all errors result in nfs_lookup_revalidate() returning zero, resulting in dentries being invalidated where they previously were not (particularly in the case of -ERESTARTSYS). Fix it by passing the actual error code to nfs_lookup_revalidate_done(), and leaving the decision on whether to map the error code to zero or one to nfs_lookup_revalidate_done(). A simple reproducer is to run the following python code in a subdirectory of an NFS mount (not in the root of the NFS mount): ---8<--- import os import multiprocessing import time if __name__=="__main__": multiprocessing.set_start_method("spawn") count = 0 while True: try: os.getcwd() pool = multiprocessing.Pool(10) pool.close() pool.terminate() count += 1 except Exception as e: print(f"Failed after {count} iterations") print(e) break ---8<--- Prior to commit 5ceb9d7fdaaf, the above code would run indefinitely. After commit 5ceb9d7fdaaf, it fails almost immediately with -ENOENT. Signed-off-by: Scott Mayhew Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 342930996226..788077a4feb9 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1627,7 +1627,16 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, switch (error) { case 1: break; - case 0: + case -ETIMEDOUT: + if (inode && (IS_ROOT(dentry) || + NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)) + error = 1; + break; + case -ESTALE: + case -ENOENT: + error = 0; + fallthrough; + default: /* * We can't d_drop the root of a disconnected tree: * its d_hash is on the s_anon list and d_drop() would hide @@ -1682,18 +1691,8 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir, dir_verifier = nfs_save_change_attribute(dir); ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); - if (ret < 0) { - switch (ret) { - case -ESTALE: - case -ENOENT: - ret = 0; - break; - case -ETIMEDOUT: - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) - ret = 1; - } + if (ret < 0) goto out; - } /* Request help from readdirplus */ nfs_lookup_advise_force_readdirplus(dir, flags); @@ -1737,7 +1736,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; - int error; + int error = 0; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); @@ -1782,7 +1781,7 @@ out_valid: out_bad: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_done(dir, dentry, inode, 0); + return nfs_lookup_revalidate_done(dir, dentry, inode, error); } static int From 296f4ce81d08e73c22408c49f4938a85bd075e5c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 29 May 2024 09:11:36 +1000 Subject: [PATCH 6/9] NFS: abort nfs_atomic_open_v23 if name is too long. An attempt to open a file with a name longer than NFS3_MAXNAMLEN will trigger a WARN_ON_ONCE in encode_filename3() because nfs_atomic_open_v23() doesn't have the test on ->d_name.len that nfs_atomic_open() has. So add that test. Reported-by: James Clark Closes: https://lore.kernel.org/all/20240528105249.69200-1-james.clark@arm.com/ Fixes: 7c6c5249f061 ("NFS: add atomic_open for NFSv3 to handle O_TRUNC correctly.") Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 788077a4feb9..2b68a14982c8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2254,6 +2254,9 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, */ int error = 0; + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + return -ENAMETOOLONG; + if (open_flags & O_CREAT) { file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); From 28568c906c1bb5f7560e18082ed7d6295860f1c2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 29 May 2024 15:44:35 -0400 Subject: [PATCH 7/9] NFSv4.1 enforce rootpath check in fs_location query In commit 4ca9f31a2be66 ("NFSv4.1 test and add 4.1 trunking transport"), we introduce the ability to query the NFS server for possible trunking locations of the existing filesystem. However, we never checked the returned file system path for these alternative locations. According to the RFC, the server can say that the filesystem currently known under "fs_root" of fs_location also resides under these server locations under the following "rootpath" pathname. The client cannot handle trunking a filesystem that reside under different location under different paths other than what the main path is. This patch enforces the check that fs_root path and rootpath path in fs_location reply is the same. Fixes: 4ca9f31a2be6 ("NFSv4.1 test and add 4.1 trunking transport") Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 94c07875aa3f..a691fa10b3e9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4023,6 +4023,23 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, } } +static bool _is_same_nfs4_pathname(struct nfs4_pathname *path1, + struct nfs4_pathname *path2) +{ + int i; + + if (path1->ncomponents != path2->ncomponents) + return false; + for (i = 0; i < path1->ncomponents; i++) { + if (path1->components[i].len != path2->components[i].len) + return false; + if (memcmp(path1->components[i].data, path2->components[i].data, + path1->components[i].len)) + return false; + } + return true; +} + static int _nfs4_discover_trunking(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -4056,9 +4073,13 @@ static int _nfs4_discover_trunking(struct nfs_server *server, if (status) goto out_free_3; - for (i = 0; i < locations->nlocations; i++) + for (i = 0; i < locations->nlocations; i++) { + if (!_is_same_nfs4_pathname(&locations->fs_path, + &locations->locations[i].rootpath)) + continue; test_fs_location_for_trunking(&locations->locations[i], clp, server); + } out_free_3: kfree(locations->fattr); out_free_2: From 33c94d7e3cb84f6d130678d6d59ba475a6c489cf Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Thu, 23 May 2024 16:47:16 +0800 Subject: [PATCH 8/9] SUNRPC: return proper error from gss_wrap_req_priv don't return 0 if snd_buf->len really greater than snd_buf->buflen Signed-off-by: Chen Hanxiao Fixes: 0c77668ddb4e ("SUNRPC: Introduce trace points in rpc_auth_gss.ko") Reviewed-by: Benjamin Coddington Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index c7af0220f82f..369310909fc9 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1875,8 +1875,10 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* slack space should prevent this ever happening: */ - if (unlikely(snd_buf->len > snd_buf->buflen)) + if (unlikely(snd_buf->len > snd_buf->buflen)) { + status = -EIO; goto wrap_failed; + } /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was * done anyway, so it's safe to put the request on the wire: */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) From 99bc9f2eb3f79a2b4296d9bf43153e1d10ca50d3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 May 2024 13:27:17 +1000 Subject: [PATCH 9/9] NFS: add barriers when testing for NFS_FSDATA_BLOCKED dentry->d_fsdata is set to NFS_FSDATA_BLOCKED while unlinking or renaming-over a file to ensure that no open succeeds while the NFS operation progressed on the server. Setting dentry->d_fsdata to NFS_FSDATA_BLOCKED is done under ->d_lock after checking the refcount is not elevated. Any attempt to open the file (through that name) will go through lookp_open() which will take ->d_lock while incrementing the refcount, we can be sure that once the new value is set, __nfs_lookup_revalidate() *will* see the new value and will block. We don't have any locking guarantee that when we set ->d_fsdata to NULL, the wait_var_event() in __nfs_lookup_revalidate() will notice. wait/wake primitives do NOT provide barriers to guarantee order. We must use smp_load_acquire() in wait_var_event() to ensure we look at an up-to-date value, and must use smp_store_release() before wake_up_var(). This patch adds those barrier functions and factors out block_revalidate() and unblock_revalidate() far clarity. There is also a hypothetical bug in that if memory allocation fails (which never happens in practice) we might leave ->d_fsdata locked. This patch adds the missing call to unblock_revalidate(). Reported-and-tested-by: Richard Kojedzinszky Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1071501 Fixes: 3c59366c207e ("NFS: don't unhash dentry during unlink/rename") Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2b68a14982c8..07a7be27182e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1803,9 +1803,10 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else { - /* Wait for unlink to complete */ + /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, - dentry->d_fsdata != NFS_FSDATA_BLOCKED); + smp_load_acquire(&dentry->d_fsdata) + != NFS_FSDATA_BLOCKED); parent = dget_parent(dentry); ret = reval(d_inode(parent), dentry, flags); dput(parent); @@ -1818,6 +1819,29 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); } +static void block_revalidate(struct dentry *dentry) +{ + /* old devname - just in case */ + kfree(dentry->d_fsdata); + + /* Any new reference that could lead to an open + * will take ->d_lock in lookup_open() -> d_lookup(). + * Holding this lock ensures we cannot race with + * __nfs_lookup_revalidate() and removes and need + * for further barriers. + */ + lockdep_assert_held(&dentry->d_lock); + + dentry->d_fsdata = NFS_FSDATA_BLOCKED; +} + +static void unblock_revalidate(struct dentry *dentry) +{ + /* store_release ensures wait_var_event() sees the update */ + smp_store_release(&dentry->d_fsdata, NULL); + wake_up_var(&dentry->d_fsdata); +} + /* * A weaker form of d_revalidate for revalidating just the d_inode(dentry) * when we don't really care about the dentry name. This is called when a @@ -2551,15 +2575,12 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); goto out; } - /* old devname */ - kfree(dentry->d_fsdata); - dentry->d_fsdata = NFS_FSDATA_BLOCKED; + block_revalidate(dentry); spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); nfs_dentry_remove_handle_error(dir, dentry, error); - dentry->d_fsdata = NULL; - wake_up_var(&dentry->d_fsdata); + unblock_revalidate(dentry); out: trace_nfs_unlink_exit(dir, dentry, error); return error; @@ -2666,8 +2687,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) { struct dentry *new_dentry = data->new_dentry; - new_dentry->d_fsdata = NULL; - wake_up_var(&new_dentry->d_fsdata); + unblock_revalidate(new_dentry); } /* @@ -2729,11 +2749,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) goto out; - if (new_dentry->d_fsdata) { - /* old devname */ - kfree(new_dentry->d_fsdata); - new_dentry->d_fsdata = NULL; - } spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { @@ -2755,7 +2770,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, new_dentry = dentry; new_inode = NULL; } else { - new_dentry->d_fsdata = NFS_FSDATA_BLOCKED; + block_revalidate(new_dentry); must_unblock = true; spin_unlock(&new_dentry->d_lock); } @@ -2767,6 +2782,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { + if (must_unblock) + unblock_revalidate(new_dentry); error = PTR_ERR(task); goto out; }