From 80d34810815b1d708e3e59901a2afcdbd90c2a6f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Aug 2018 15:55:59 +0300 Subject: [PATCH 1/7] ovl: respect FIEMAP_FLAG_SYNC flag Stacked overlayfs fiemap operation broke xfstests that test delayed allocation (with "_test_generic_punch -d"), because ovl_fiemap() failed to write dirty pages when requested. Fixes: 9e142c4102db ("ovl: add ovl_fiemap()") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index e0bb217c01e2..5014749fd4b4 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -467,6 +467,10 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -EOPNOTSUPP; old_cred = ovl_override_creds(inode->i_sb); + + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(realinode->i_mapping); + err = realinode->i_op->fiemap(realinode, fieinfo, start, len); revert_creds(old_cred); From 5b910bd615ba947383e63cd1ed106ffa3060159e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Aug 2018 15:56:00 +0300 Subject: [PATCH 2/7] ovl: fix GPF in swapfile_activate of file from overlayfs over xfs Since overlayfs implements stacked file operations, the underlying filesystems are not supposed to be exposed to the overlayfs file, whose f_inode is an overlayfs inode. Assigning an overlayfs file to swap_file results in an attempt of xfs code to dereference an xfs_inode struct from an ovl_inode pointer: CPU: 0 PID: 2462 Comm: swapon Not tainted 4.18.0-xfstests-12721-g33e17876ea4e #3402 RIP: 0010:xfs_find_bdev_for_inode+0x23/0x2f Call Trace: xfs_iomap_swapfile_activate+0x1f/0x43 __se_sys_swapon+0xb1a/0xee9 Fix this by not assigning the real inode mapping to f_mapping, which will cause swapon() to return an error (-EINVAL). Although it makes sense not to allow setting swpafile on an overlayfs file, some users may depend on it, so we may need to fix this up in the future. Keeping f_mapping pointing to overlay inode mapping will cause O_DIRECT open to fail. Fix this by installing ovl_aops with noop_direct_IO in overlay inode mapping. Keeping f_mapping pointing to overlay inode mapping will cause other a_ops related operations to fail (e.g. readahead()). Those will be fixed by follow up patches. Suggested-by: Miklos Szeredi Fixes: f7c72396d0de ("ovl: add O_DIRECT support") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/file.c | 3 --- fs/overlayfs/inode.c | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 32e9282893c9..a4acd84591d4 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -131,9 +131,6 @@ static int ovl_open(struct inode *inode, struct file *file) if (IS_ERR(realfile)) return PTR_ERR(realfile); - /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ - file->f_mapping = realfile->f_mapping; - file->private_data = realfile; return 0; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 5014749fd4b4..b6ac545b5a32 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -504,6 +504,11 @@ static const struct inode_operations ovl_special_inode_operations = { .update_time = ovl_update_time, }; +const struct address_space_operations ovl_aops = { + /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ + .direct_IO = noop_direct_IO, +}; + /* * It is possible to stack overlayfs instance on top of another * overlayfs instance as lower layer. We need to annonate the @@ -575,6 +580,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, case S_IFREG: inode->i_op = &ovl_file_inode_operations; inode->i_fop = &ovl_file_operations; + inode->i_mapping->a_ops = &ovl_aops; break; case S_IFDIR: From 17ef445f9befdc5c9adac270b18240ad24ee50ec Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Aug 2018 15:56:01 +0300 Subject: [PATCH 3/7] Documentation/filesystems: update documentation of file_operations ...to kernel 4.18. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- Documentation/filesystems/vfs.txt | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 4b2084d0f1fb..ec2142c8dbd3 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -848,7 +848,7 @@ struct file_operations ---------------------- This describes how the VFS can manipulate an open file. As of kernel -4.1, the following members are defined: +4.18, the following members are defined: struct file_operations { struct module *owner; @@ -858,11 +858,11 @@ struct file_operations { ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); + int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); - int (*mremap)(struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); @@ -882,6 +882,9 @@ struct file_operations { #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); + int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); }; Again, all methods are called without any locks being held, unless @@ -899,6 +902,9 @@ otherwise noted. iterate: called when the VFS needs to read the directory contents + iterate_shared: called when the VFS needs to read the directory contents + when filesystem supports concurrent dir iterators + poll: called by the VFS when a process wants to check if there is activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls @@ -951,6 +957,14 @@ otherwise noted. fallocate: called by the VFS to preallocate blocks or punch a hole. + copy_file_range: called by the copy_file_range(2) system call. + + clone_file_range: called by the ioctl(2) system call for FICLONERANGE and + FICLONE commands. + + dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE + command. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special From 45cd0faae3715e305bc46e23b34c5ed4d185ceb8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Aug 2018 15:56:02 +0300 Subject: [PATCH 4/7] vfs: add the fadvise() file operation This is going to be used by overlayfs and possibly useful for other filesystems. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- Documentation/filesystems/vfs.txt | 3 ++ include/linux/fs.h | 5 ++ mm/fadvise.c | 78 ++++++++++++++++++------------- 3 files changed, 53 insertions(+), 33 deletions(-) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ec2142c8dbd3..a6c6a8af48a2 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -885,6 +885,7 @@ struct file_operations { ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*fadvise)(struct file *, loff_t, loff_t, int); }; Again, all methods are called without any locks being held, unless @@ -965,6 +966,8 @@ otherwise noted. dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE command. + fadvise: possibly called by the fadvise64() system call. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special diff --git a/include/linux/fs.h b/include/linux/fs.h index 33322702c910..6c0b4a1c22ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1763,6 +1763,7 @@ struct file_operations { u64); int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*fadvise)(struct file *, loff_t, loff_t, int); } __randomize_layout; struct inode_operations { @@ -3459,4 +3460,8 @@ static inline bool dir_relax_shared(struct inode *inode) extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); +/* mm/fadvise.c */ +extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, + int advice); + #endif /* _LINUX_FS_H */ diff --git a/mm/fadvise.c b/mm/fadvise.c index 2d8376e3c640..2f59bac1cb77 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -27,9 +27,9 @@ * deactivate the pages and clear PG_Referenced. */ -int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +static int generic_fadvise(struct file *file, loff_t offset, loff_t len, + int advice) { - struct fd f = fdget(fd); struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; @@ -37,22 +37,14 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; - int ret = 0; - if (!f.file) - return -EBADF; + inode = file_inode(file); + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; - inode = file_inode(f.file); - if (S_ISFIFO(inode->i_mode)) { - ret = -ESPIPE; - goto out; - } - - mapping = f.file->f_mapping; - if (!mapping || len < 0) { - ret = -EINVAL; - goto out; - } + mapping = file->f_mapping; + if (!mapping || len < 0) + return -EINVAL; bdi = inode_to_bdi(mapping->host); @@ -67,9 +59,9 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* no bad return value, but ignore advice */ break; default: - ret = -EINVAL; + return -EINVAL; } - goto out; + return 0; } /* @@ -85,21 +77,21 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: - f.file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - spin_lock(&f.file->f_lock); - f.file->f_mode |= FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: - f.file->f_ra.ra_pages = bdi->ra_pages * 2; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ @@ -115,8 +107,7 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) * Ignore return value because fadvise() shall return * success even if filesystem can't retrieve a hint, */ - force_page_cache_readahead(mapping, f.file, start_index, - nrpages); + force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: break; @@ -183,9 +174,30 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) } break; default: - ret = -EINVAL; + return -EINVAL; } -out: + return 0; +} + +int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + if (file->f_op->fadvise) + return file->f_op->fadvise(file, offset, len, advice); + + return generic_fadvise(file, offset, len, advice); +} +EXPORT_SYMBOL(vfs_fadvise); + +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +{ + struct fd f = fdget(fd); + int ret; + + if (!f.file) + return -EBADF; + + ret = vfs_fadvise(f.file, offset, len, advice); + fdput(f); return ret; } From 3d8f7615319b2bca87a4815e13787439e3339a93 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 29 Aug 2018 08:41:29 +0300 Subject: [PATCH 5/7] vfs: implement readahead(2) using POSIX_FADV_WILLNEED The implementation of readahead(2) syscall is identical to that of fadvise64(POSIX_FADV_WILLNEED) with a few exceptions: 1. readahead(2) returns -EINVAL for !mapping->a_ops and fadvise64() ignores the request and returns 0. 2. fadvise64() checks for integer overflow corner case 3. fadvise64() calls the optional filesystem fadvise() file operation Unite the two implementations by calling vfs_fadvise() from readahead(2) syscall. Check the !mapping->a_ops in readahead(2) syscall to preserve documented syscall ABI behaviour. Suggested-by: Miklos Szeredi Fixes: d1d04ef8572b ("ovl: stack file ops") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- mm/Makefile | 3 +-- mm/fadvise.c | 3 +++ mm/readahead.c | 45 +++++++++++++++++---------------------------- 3 files changed, 21 insertions(+), 30 deletions(-) diff --git a/mm/Makefile b/mm/Makefile index 8716bdabe1e6..26ef77a3883b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -32,7 +32,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif -obj-y := filemap.o mempool.o oom_kill.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ @@ -49,7 +49,6 @@ else obj-y += bootmem.o endif -obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif diff --git a/mm/fadvise.c b/mm/fadvise.c index 2f59bac1cb77..467bcd032037 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -188,6 +188,8 @@ int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) } EXPORT_SYMBOL(vfs_fadvise); +#ifdef CONFIG_ADVISE_SYSCALLS + int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct fd f = fdget(fd); @@ -215,3 +217,4 @@ SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) } #endif +#endif diff --git a/mm/readahead.c b/mm/readahead.c index a59ea70527b9..4e630143a0ba 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "internal.h" @@ -575,24 +576,6 @@ page_cache_async_readahead(struct address_space *mapping, } EXPORT_SYMBOL_GPL(page_cache_async_readahead); -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - pgoff_t index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops) - return -EINVAL; - - /* - * Readahead doesn't make sense for DAX inodes, but we don't want it - * to report a failure either. Instead, we just return success and - * don't do any work. - */ - if (dax_mapping(mapping)) - return 0; - - return force_page_cache_readahead(mapping, filp, index, nr); -} - ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; @@ -600,16 +583,22 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) ret = -EBADF; f = fdget(fd); - if (f.file) { - if (f.file->f_mode & FMODE_READ) { - struct address_space *mapping = f.file->f_mapping; - pgoff_t start = offset >> PAGE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, f.file, start, len); - } - fdput(f); - } + if (!f.file || !(f.file->f_mode & FMODE_READ)) + goto out; + + /* + * The readahead() syscall is intended to run only on files + * that can execute readahead. If readahead is not possible + * on this file, then we must return -EINVAL. + */ + ret = -EINVAL; + if (!f.file->f_mapping || !f.file->f_mapping->a_ops || + !S_ISREG(file_inode(f.file)->i_mode)) + goto out; + + ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); +out: + fdput(f); return ret; } From b833a3660394876541d2513ce2736debc7c6797a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 28 Aug 2018 10:58:41 +0300 Subject: [PATCH 6/7] ovl: add ovl_fadvise() Implement stacked fadvise to fix syscalls readahead(2) and fadvise64(2) on an overlayfs file. Suggested-by: Miklos Szeredi Fixes: d1d04ef8572b ("ovl: stack file ops") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/file.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a4acd84591d4..aeaefd2a551b 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -331,6 +331,25 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len return ret; } +static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fadvise(real.file, offset, len, advice); + revert_creds(old_cred); + + fdput(real); + + return ret; +} + static long ovl_real_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -499,6 +518,7 @@ const struct file_operations ovl_file_operations = { .fsync = ovl_fsync, .mmap = ovl_mmap, .fallocate = ovl_fallocate, + .fadvise = ovl_fadvise, .unlocked_ioctl = ovl_ioctl, .compat_ioctl = ovl_compat_ioctl, From 8c25741aaad8be6fbe51510e917c740e0059cf83 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Sep 2018 11:43:29 +0200 Subject: [PATCH 7/7] ovl: fix oopses in ovl_fill_super() failure paths ovl_free_fs() dereferences ofs->workbasedir and ofs->upper_mnt in cases when those might not have been initialized yet. Fix the initialization order for these fields. Reported-by: syzbot+c75f181dc8429d2eb887@syzkaller.appspotmail.com Signed-off-by: Miklos Szeredi Cc: # v4.15 Fixes: 95e6d4177cb7 ("ovl: grab reference to workbasedir early") Fixes: a9075cdb467d ("ovl: factor out ovl_free_fs() helper") --- fs/overlayfs/super.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 2e0fc93c2c06..30adc9d408a0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -982,16 +982,6 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) if (err) goto out; - err = -EBUSY; - if (ovl_inuse_trylock(upperpath->dentry)) { - ofs->upperdir_locked = true; - } else if (ofs->config.index) { - pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); - goto out; - } else { - pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); - } - upper_mnt = clone_private_mount(upperpath); err = PTR_ERR(upper_mnt); if (IS_ERR(upper_mnt)) { @@ -1002,6 +992,17 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) /* Don't inherit atime flags */ upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); ofs->upper_mnt = upper_mnt; + + err = -EBUSY; + if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) { + ofs->upperdir_locked = true; + } else if (ofs->config.index) { + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); + goto out; + } else { + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + } + err = 0; out: return err; @@ -1101,8 +1102,10 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) goto out; } + ofs->workbasedir = dget(workpath.dentry); + err = -EBUSY; - if (ovl_inuse_trylock(workpath.dentry)) { + if (ovl_inuse_trylock(ofs->workbasedir)) { ofs->workdir_locked = true; } else if (ofs->config.index) { pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); @@ -1111,7 +1114,6 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); } - ofs->workbasedir = dget(workpath.dentry); err = ovl_make_workdir(ofs, &workpath); if (err) goto out;