summary | refs | log | tree | commit | diff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/vfs_inode.c 3
-rw-r--r-- fs/9p/vfs_inode_dotl.c 3
-rw-r--r-- fs/aio.c 55
-rw-r--r-- fs/autofs4/waitq.c 15
-rw-r--r-- fs/binfmt_elf_fdpic.c 2
-rw-r--r-- fs/ceph/caps.c 9
-rw-r--r-- fs/ceph/inode.c 9
-rw-r--r-- fs/ceph/locks.c 177
-rw-r--r-- fs/ceph/mds_client.c 96
-rw-r--r-- fs/ceph/super.c 5
-rw-r--r-- fs/ceph/super.h 4
-rw-r--r-- fs/coda/upcall.c 3
-rw-r--r-- fs/compat_ioctl.c 123
-rw-r--r-- fs/coredump.c 7
-rw-r--r-- fs/cramfs/Kconfig 39
-rw-r--r-- fs/cramfs/README 31
-rw-r--r-- fs/cramfs/inode.c 511
-rw-r--r-- fs/dax.c 319
-rw-r--r-- fs/ecryptfs/crypto.c 44
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h 9
-rw-r--r-- fs/ecryptfs/inode.c 4
-rw-r--r-- fs/ecryptfs/keystore.c 48
-rw-r--r-- fs/ecryptfs/main.c 4
-rw-r--r-- fs/ecryptfs/messaging.c 13
-rw-r--r-- fs/ecryptfs/miscdev.c 8
-rw-r--r-- fs/ecryptfs/mmap.c 2
-rw-r--r-- fs/eventpoll.c 135
-rw-r--r-- fs/ext2/file.c 2
-rw-r--r-- fs/ext4/file.c 26
-rw-r--r-- fs/ext4/inode.c 15
-rw-r--r-- fs/ext4/ioctl.c 86
-rw-r--r-- fs/fat/dir.c 1
-rw-r--r-- fs/fcntl.c 16
-rw-r--r-- fs/fhandle.c 4
-rw-r--r-- fs/file.c 12
-rw-r--r-- fs/hfs/bnode.c 4
-rw-r--r-- fs/hfsplus/bnode.c 4
-rw-r--r-- fs/internal.h 1
-rw-r--r-- fs/iomap.c 24
-rw-r--r-- fs/jbd2/journal.c 17
-rw-r--r-- fs/lockd/svc.c 20
-rw-r--r-- fs/namei.c 14
-rw-r--r-- fs/nfs/cache_lib.c 6
-rw-r--r-- fs/nfs/cache_lib.h 2
-rw-r--r-- fs/nfs/callback.c 14
-rw-r--r-- fs/nfs/callback_proc.c 2
-rw-r--r-- fs/nfs/client.c 10
-rw-r--r-- fs/nfs/delegation.c 27
-rw-r--r-- fs/nfs/delegation.h 1
-rw-r--r-- fs/nfs/dir.c 50
-rw-r--r-- fs/nfs/file.c 18
-rw-r--r-- fs/nfs/filelayout/filelayout.c 12
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c 20
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.h 3
-rw-r--r-- fs/nfs/inode.c 16
-rw-r--r-- fs/nfs/nfs3proc.c 17
-rw-r--r-- fs/nfs/nfs4_fs.h 12
-rw-r--r-- fs/nfs/nfs4client.c 12
-rw-r--r-- fs/nfs/nfs4proc.c 511
-rw-r--r-- fs/nfs/nfs4state.c 53
-rw-r--r-- fs/nfs/nfs4trace.h 26
-rw-r--r-- fs/nfs/nfs4xdr.c 12
-rw-r--r-- fs/nfs/pnfs.c 44
-rw-r--r-- fs/nfs/pnfs.h 15
-rw-r--r-- fs/nfs/pnfs_nfs.c 10
-rw-r--r-- fs/nfs/super.c 14
-rw-r--r-- fs/nfs/write.c 17
-rw-r--r-- fs/nfs_common/grace.c 24
-rw-r--r-- fs/nfsd/fault_inject.c 5
-rw-r--r-- fs/nfsd/netns.h 2
-rw-r--r-- fs/nfsd/nfs3xdr.c 10
-rw-r--r-- fs/nfsd/nfs4layouts.c 4
-rw-r--r-- fs/nfsd/nfs4proc.c 19
-rw-r--r-- fs/nfsd/nfs4state.c 127
-rw-r--r-- fs/nfsd/nfssvc.c 4
-rw-r--r-- fs/nfsd/state.h 11
-rw-r--r-- fs/nfsd/xdr4.h 13
-rw-r--r-- fs/nilfs2/namei.c 2
-rw-r--r-- fs/nilfs2/segment.c 17
-rw-r--r-- fs/nilfs2/segment.h 1
-rw-r--r-- fs/nilfs2/sufile.c 32
-rw-r--r-- fs/nilfs2/super.c 1
-rw-r--r-- fs/nilfs2/the_nilfs.c 8
-rw-r--r-- fs/nilfs2/the_nilfs.h 5
-rw-r--r-- fs/orangefs/acl.c 10
-rw-r--r-- fs/orangefs/dir.c 1
-rw-r--r-- fs/orangefs/file.c 16
-rw-r--r-- fs/orangefs/inode.c 17
-rw-r--r-- fs/orangefs/namei.c 45
-rw-r--r-- fs/orangefs/orangefs-debug.h 4
-rw-r--r-- fs/orangefs/orangefs-kernel.h 37
-rw-r--r-- fs/orangefs/orangefs-utils.c 86
-rw-r--r-- fs/orangefs/super.c 15
-rw-r--r-- fs/orangefs/symlink.c 1
-rw-r--r-- fs/overlayfs/copy_up.c 8
-rw-r--r-- fs/overlayfs/dir.c 25
-rw-r--r-- fs/overlayfs/inode.c 63
-rw-r--r-- fs/overlayfs/namei.c 59
-rw-r--r-- fs/overlayfs/overlayfs.h 13
-rw-r--r-- fs/overlayfs/ovl_entry.h 14
-rw-r--r-- fs/overlayfs/readdir.c 55
-rw-r--r-- fs/overlayfs/super.c 688
-rw-r--r-- fs/overlayfs/util.c 21
-rw-r--r-- fs/pipe.c 23
-rw-r--r-- fs/proc/Makefile 1
-rw-r--r-- fs/proc/array.c 6
-rw-r--r-- fs/proc/base.c 2
-rw-r--r-- fs/proc/cpuinfo.c 6
-rw-r--r-- fs/proc/internal.h 25
-rw-r--r-- fs/proc/loadavg.c 2
-rw-r--r-- fs/proc/task_mmu.c 1
-rw-r--r-- fs/proc/util.c 23
-rw-r--r-- fs/pstore/platform.c 2
-rw-r--r-- fs/read_write.c 21
-rw-r--r-- fs/select.c 68
-rw-r--r-- fs/signalfd.c 4
-rw-r--r-- fs/statfs.c 2
-rw-r--r-- fs/super.c 46
-rw-r--r-- fs/xfs/libxfs/xfs_iext_tree.c 4
-rw-r--r-- fs/xfs/libxfs/xfs_inode_fork.c 8
-rw-r--r-- fs/xfs/libxfs/xfs_log_format.h 4
-rw-r--r-- fs/xfs/xfs_file.c 44
-rw-r--r-- fs/xfs/xfs_inode.c 1
-rw-r--r-- fs/xfs/xfs_iomap.c 5
-rw-r--r-- fs/xfs/xfs_linux.h 10
-rw-r--r-- fs/xfs/xfs_trace.h 2
126 files changed, 2785 insertions, 1836 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2a5de610dd8f..bdabb2765d1b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -483,6 +483,9 @@ static int v9fs_test_inode(struct inode *inode, void *data)
if (v9inode->qid.type != st->qid.type)
return 0;
+
+ if (v9inode->qid.path != st->qid.path)
+ return 0;
return 1;
}
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 70f9887c59a9..7f6ae21a27b3 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -87,6 +87,9 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data)
if (v9inode->qid.type != st->qid.type)
return 0;
+
+ if (v9inode->qid.path != st->qid.path)
+ return 0;
return 1;
}
diff --git a/fs/aio.c b/fs/aio.c
index e6de7715228c..a062d75109cb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1297,20 +1297,10 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
static long read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event,
- struct timespec __user *timeout)
+ ktime_t until)
{
- ktime_t until = KTIME_MAX;
long ret = 0;
- if (timeout) {
- struct timespec ts;
-
- if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
- return -EFAULT;
-
- until = timespec_to_ktime(ts);
- }
-
/*
* Note that aio_read_events() is being called as the conditional - i.e.
* we're calling it after prepare_to_wait() has set task state to
@@ -1826,6 +1816,25 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return ret;
}
+static long do_io_getevents(aio_context_t ctx_id,
+ long min_nr,
+ long nr,
+ struct io_event __user *events,
+ struct timespec64 *ts)
+{
+ ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
+ struct kioctx *ioctx = lookup_ioctx(ctx_id);
+ long ret = -EINVAL;
+
+ if (likely(ioctx)) {
+ if (likely(min_nr <= nr && min_nr >= 0))
+ ret = read_events(ioctx, min_nr, nr, events, until);
+ percpu_ref_put(&ioctx->users);
+ }
+
+ return ret;
+}
+
/* io_getevents:
* Attempts to read at least min_nr events and up to nr events from
* the completion queue for the aio_context specified by ctx_id. If
@@ -1844,15 +1853,14 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
struct io_event __user *, events,
struct timespec __user *, timeout)
{
- struct kioctx *ioctx = lookup_ioctx(ctx_id);
- long ret = -EINVAL;
+ struct timespec64 ts;
- if (likely(ioctx)) {
- if (likely(min_nr <= nr && min_nr >= 0))
- ret = read_events(ioctx, min_nr, nr, events, timeout);
- percpu_ref_put(&ioctx->users);
+ if (timeout) {
+ if (unlikely(get_timespec64(&ts, timeout)))
+ return -EFAULT;
}
- return ret;
+
+ return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
}
#ifdef CONFIG_COMPAT
@@ -1862,17 +1870,14 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
struct io_event __user *, events,
struct compat_timespec __user *, timeout)
{
- struct timespec t;
- struct timespec __user *ut = NULL;
+ struct timespec64 t;
if (timeout) {
- if (compat_get_timespec(&t, timeout))
+ if (compat_get_timespec64(&t, timeout))
return -EFAULT;
- ut = compat_alloc_user_space(sizeof(*ut));
- if (copy_to_user(ut, &t, sizeof(t)))
- return -EFAULT;
}
- return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
+
+ return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
}
#endif
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 4ac49d038bf3..8fc41705c7cd 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -81,7 +81,8 @@ static int autofs4_write(struct autofs_sb_info *sbi,
spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
- return (bytes > 0);
+ /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */
+ return bytes == 0 ? 0 : wr < 0 ? wr : -EIO;
}
static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
@@ -95,6 +96,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
} pkt;
struct file *pipe = NULL;
size_t pktsz;
+ int ret;
pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
(unsigned long) wq->wait_queue_token,
@@ -169,7 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
mutex_unlock(&sbi->wq_mutex);
if (autofs4_write(sbi, pipe, &pkt, pktsz))
+ switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) {
+ case 0:
+ break;
+ case -ENOMEM:
+ case -ERESTARTSYS:
+ /* Just fail this one */
+ autofs4_wait_release(sbi, wq->wait_queue_token, ret);
+ break;
+ default:
autofs4_catatonic_mode(sbi);
+ break;
+ }
fput(pipe);
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 5429b035e249..429326b6e2e7 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1498,7 +1498,9 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
struct vm_area_struct *vma;
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+#ifdef CONFIG_MMU
unsigned long addr;
+#endif
if (!maydump(vma, cprm->mm_flags))
continue;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ff5d32cf9578..a14b2c974c9e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1160,7 +1160,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->vfs_inode;
struct cap_msg_args arg;
- int held, revoking, dropping;
+ int held, revoking;
int wake = 0;
int delayed = 0;
int ret;
@@ -1168,7 +1168,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
- dropping = cap->issued & ~retain;
dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
inode, cap, cap->session,
@@ -1712,7 +1711,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
/* if we are unmounting, flush any unused caps immediately. */
if (mdsc->stopping)
- is_delayed = 1;
+ is_delayed = true;
spin_lock(&ci->i_ceph_lock);
@@ -3189,8 +3188,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
bool drop = false;
- bool wake_ci = 0;
- bool wake_mdsc = 0;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid == flush_tid)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f2550a076edc..ab81652198c4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -493,6 +493,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_wb_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
+ atomic_set(&ci->i_filelock_ref, 0);
ci->i_shared_gen = 0;
ci->i_rdcache_gen = 0;
ci->i_rdcache_revoking = 0;
@@ -786,7 +787,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
/* update inode */
ci->i_version = le64_to_cpu(info->version);
- inode->i_version++;
inode->i_rdev = le32_to_cpu(info->rdev);
inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
@@ -1185,6 +1185,7 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
dout(" dn %p points to wrong inode %p\n",
dn, d_inode(dn));
+ ceph_dir_clear_ordered(dir);
d_delete(dn);
dput(dn);
goto retry_lookup;
@@ -1322,6 +1323,7 @@ retry_lookup:
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)),
ceph_vinop(in));
+ ceph_dir_clear_ordered(dir);
d_invalidate(dn);
have_lease = false;
}
@@ -1573,6 +1575,7 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
dout(" dn %p points to wrong inode %p\n",
dn, d_inode(dn));
+ __ceph_dir_clear_ordered(ci);
d_delete(dn);
dput(dn);
goto retry_lookup;
@@ -1597,7 +1600,9 @@ retry_lookup:
&req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in);
- if (d_really_is_negative(dn))
+ if (d_really_is_positive(dn))
+ __ceph_dir_clear_ordered(ci);
+ else
iput(in);
d_drop(dn);
err = ret;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index e7cce412f2cf..9e66f69ee8a5 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -30,19 +30,52 @@ void __init ceph_flock_init(void)
get_random_bytes(&lock_secret, sizeof(lock_secret));
}
+static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
+{
+ struct inode *inode = file_inode(src->fl_file);
+ atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+}
+
+static void ceph_fl_release_lock(struct file_lock *fl)
+{
+ struct inode *inode = file_inode(fl->fl_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ if (atomic_dec_and_test(&ci->i_filelock_ref)) {
+ /* clear error when all locks are released */
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static const struct file_lock_operations ceph_fl_lock_ops = {
+ .fl_copy_lock = ceph_fl_copy_lock,
+ .fl_release_private = ceph_fl_release_lock,
+};
+
/**
* Implement fcntl and flock locking functions.
*/
-static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
+static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
- struct inode *inode = file_inode(file);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
u64 length = 0;
u64 owner;
+ if (operation == CEPH_MDS_OP_SETFILELOCK) {
+ /*
+ * increasing i_filelock_ref closes race window between
+ * handling request reply and adding file_lock struct to
+ * inode. Otherwise, auth caps may get trimmed in the
+ * window. Caller function will decrease the counter.
+ */
+ fl->fl_ops = &ceph_fl_lock_ops;
+ atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+ }
+
if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
wait = 0;
@@ -180,10 +213,12 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
*/
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
- u8 lock_cmd;
- int err;
- u8 wait = 0;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
+ u8 wait = 0;
+ u8 lock_cmd;
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
@@ -199,6 +234,26 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else if (IS_SETLKW(cmd))
wait = 1;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
+ err = -EIO;
+ } else if (op == CEPH_MDS_OP_SETFILELOCK) {
+ /*
+ * increasing i_filelock_ref closes race window between
+ * handling request reply and adding file_lock struct to
+ * inode. Otherwise, i_auth_cap may get trimmed in the
+ * window. Caller function will decrease the counter.
+ */
+ fl->fl_ops = &ceph_fl_lock_ops;
+ atomic_inc(&ci->i_filelock_ref);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (err < 0) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
+ posix_lock_file(file, fl, NULL);
+ return err;
+ }
+
if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
else if (F_WRLCK == fl->fl_type)
@@ -206,16 +261,16 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
- err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
+ err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
- if (op != CEPH_MDS_OP_GETFILELOCK) {
+ if (op == CEPH_MDS_OP_SETFILELOCK) {
dout("mds locked, locking locally");
err = posix_lock_file(file, fl, NULL);
- if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+ if (err) {
/* undo! This should only happen if
* the kernel detects local
* deadlock. */
- ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+ ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
CEPH_LOCK_UNLOCK, 0, fl);
dout("got %d on posix_lock_file, undid lock",
err);
@@ -227,9 +282,11 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
- u8 lock_cmd;
- int err;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err = 0;
u8 wait = 0;
+ u8 lock_cmd;
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
@@ -239,6 +296,21 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
dout("ceph_flock, fl_file: %p", fl->fl_file);
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
+ err = -EIO;
+ } else {
+ /* see comment in ceph_lock */
+ fl->fl_ops = &ceph_fl_lock_ops;
+ atomic_inc(&ci->i_filelock_ref);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (err < 0) {
+ if (F_UNLCK == fl->fl_type)
+ locks_lock_file_wait(file, fl);
+ return err;
+ }
+
if (IS_SETLKW(cmd))
wait = 1;
@@ -250,13 +322,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
lock_cmd = CEPH_LOCK_UNLOCK;
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
- file, lock_cmd, wait, fl);
+ inode, lock_cmd, wait, fl);
if (!err) {
err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
- file, CEPH_LOCK_UNLOCK, 0, fl);
+ inode, CEPH_LOCK_UNLOCK, 0, fl);
dout("got %d on locks_lock_file_wait, undid lock", err);
}
}
@@ -288,6 +360,37 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
*flock_count, *fcntl_count);
}
+/*
+ * Given a pointer to a lock, convert it to a ceph filelock
+ */
+static int lock_to_ceph_filelock(struct file_lock *lock,
+ struct ceph_filelock *cephlock)
+{
+ int err = 0;
+ cephlock->start = cpu_to_le64(lock->fl_start);
+ cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
+ cephlock->client = cpu_to_le64(0);
+ cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
+ cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
+
+ switch (lock->fl_type) {
+ case F_RDLCK:
+ cephlock->type = CEPH_LOCK_SHARED;
+ break;
+ case F_WRLCK:
+ cephlock->type = CEPH_LOCK_EXCL;
+ break;
+ case F_UNLCK:
+ cephlock->type = CEPH_LOCK_UNLOCK;
+ break;
+ default:
+ dout("Have unknown lock type %d", lock->fl_type);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
/**
* Encode the flock and fcntl locks for the given inode into the ceph_filelock
* array. Must be called with inode->i_lock already held.
@@ -356,50 +459,22 @@ int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
if (err)
goto out_fail;
- err = ceph_pagelist_append(pagelist, flocks,
- num_fcntl_locks * sizeof(*flocks));
- if (err)
- goto out_fail;
+ if (num_fcntl_locks > 0) {
+ err = ceph_pagelist_append(pagelist, flocks,
+ num_fcntl_locks * sizeof(*flocks));
+ if (err)
+ goto out_fail;
+ }
nlocks = cpu_to_le32(num_flock_locks);
err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
if (err)
goto out_fail;
- err = ceph_pagelist_append(pagelist,
- &flocks[num_fcntl_locks],
- num_flock_locks * sizeof(*flocks));
-out_fail:
- return err;
-}
-
-/*
- * Given a pointer to a lock, convert it to a ceph filelock
- */
-int lock_to_ceph_filelock(struct file_lock *lock,
- struct ceph_filelock *cephlock)
-{
- int err = 0;
- cephlock->start = cpu_to_le64(lock->fl_start);
- cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
- cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
- cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
-
- switch (lock->fl_type) {
- case F_RDLCK:
- cephlock->type = CEPH_LOCK_SHARED;
- break;
- case F_WRLCK:
- cephlock->type = CEPH_LOCK_EXCL;
- break;
- case F_UNLCK:
- cephlock->type = CEPH_LOCK_UNLOCK;
- break;
- default:
- dout("Have unknown lock type %d", lock->fl_type);
- err = -EINVAL;
+ if (num_flock_locks > 0) {
+ err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
+ num_flock_locks * sizeof(*flocks));
}
-
+out_fail:
return err;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0687ab3c3267..ab69dcb70e8a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1039,22 +1039,23 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
* session caps
*/
-/* caller holds s_cap_lock, we drop it */
-static void cleanup_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
- __releases(session->s_cap_lock)
+static void detach_cap_releases(struct ceph_mds_session *session,
+ struct list_head *target)
{
- LIST_HEAD(tmp_list);
- list_splice_init(&session->s_cap_releases, &tmp_list);
+ lockdep_assert_held(&session->s_cap_lock);
+
+ list_splice_init(&session->s_cap_releases, target);
session->s_num_cap_releases = 0;
- spin_unlock(&session->s_cap_lock);
+ dout("dispose_cap_releases mds%d\n", session->s_mds);
+}
- dout("cleanup_cap_releases mds%d\n", session->s_mds);
- while (!list_empty(&tmp_list)) {
+static void dispose_cap_releases(struct ceph_mds_client *mdsc,
+ struct list_head *dispose)
+{
+ while (!list_empty(dispose)) {
struct ceph_cap *cap;
/* zero out the in-progress message */
- cap = list_first_entry(&tmp_list,
- struct ceph_cap, session_caps);
+ cap = list_first_entry(dispose, struct ceph_cap, session_caps);
list_del(&cap->session_caps);
ceph_put_cap(mdsc, cap);
}
@@ -1215,6 +1216,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
spin_unlock(&mdsc->cap_dirty_lock);
+ if (atomic_read(&ci->i_filelock_ref) > 0) {
+ /* make further file lock syscall return -EIO */
+ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
+ pr_warn_ratelimited(" dropping file locks for %p %lld\n",
+ inode, ceph_ino(inode));
+ }
+
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL;
@@ -1244,6 +1252,8 @@ static void remove_session_caps(struct ceph_mds_session *session)
{
struct ceph_fs_client *fsc = session->s_mdsc->fsc;
struct super_block *sb = fsc->sb;
+ LIST_HEAD(dispose);
+
dout("remove_session_caps on %p\n", session);
iterate_session_caps(session, remove_session_caps_cb, fsc);
@@ -1278,10 +1288,12 @@ static void remove_session_caps(struct ceph_mds_session *session)
}
// drop cap expires and unlock s_cap_lock
- cleanup_cap_releases(session->s_mdsc, session);
+ detach_cap_releases(session, &dispose);
BUG_ON(session->s_nr_caps > 0);
BUG_ON(!list_empty(&session->s_cap_flushing));
+ spin_unlock(&session->s_cap_lock);
+ dispose_cap_releases(session->s_mdsc, &dispose);
}
/*
@@ -1462,6 +1474,11 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
goto out;
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
+ /* Note: it's possible that i_filelock_ref becomes non-zero
+ * after dropping auth caps. It doesn't hurt because reply
+ * of lock mds request will re-add auth caps. */
+ if (atomic_read(&ci->i_filelock_ref) > 0)
+ goto out;
}
/* The inode has cached pages, but it's no longer used.
* we can safely drop it */
@@ -2827,7 +2844,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
} rec;
- struct ceph_inode_info *ci;
+ struct ceph_inode_info *ci = cap->ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
char *path;
@@ -2836,8 +2853,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
u64 snap_follows;
struct dentry *dentry;
- ci = cap->ci;
-
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
@@ -2870,7 +2885,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v2.pathbase = cpu_to_le64(pathbase);
- rec.v2.flock_len = 0;
+ rec.v2.flock_len = (__force __le32)
+ ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -2894,26 +2910,37 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
- struct ceph_filelock *flocks;
+ struct ceph_filelock *flocks = NULL;
size_t struct_len, total_len = 0;
u8 struct_v = 0;
encode_again:
- ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
- flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock), GFP_NOFS);
- if (!flocks) {
- err = -ENOMEM;
- goto out_free;
+ if (rec.v2.flock_len) {
+ ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+ } else {
+ num_fcntl_locks = 0;
+ num_flock_locks = 0;
}
- err = ceph_encode_locks_to_buffer(inode, flocks,
- num_fcntl_locks,
- num_flock_locks);
- if (err) {
+ if (num_fcntl_locks + num_flock_locks > 0) {
+ flocks = kmalloc((num_fcntl_locks + num_flock_locks) *
+ sizeof(struct ceph_filelock), GFP_NOFS);
+ if (!flocks) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ err = ceph_encode_locks_to_buffer(inode, flocks,
+ num_fcntl_locks,
+ num_flock_locks);
+ if (err) {
+ kfree(flocks);
+ flocks = NULL;
+ if (err == -ENOSPC)
+ goto encode_again;
+ goto out_free;
+ }
+ } else {
kfree(flocks);
- if (err == -ENOSPC)
- goto encode_again;
- goto out_free;
+ flocks = NULL;
}
if (recon_state->msg_version >= 3) {
@@ -2993,6 +3020,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
int s_nr_caps;
struct ceph_pagelist *pagelist;
struct ceph_reconnect_state recon_state;
+ LIST_HEAD(dispose);
pr_info("mds%d reconnect start\n", mds);
@@ -3026,7 +3054,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
*/
session->s_cap_reconnect = 1;
/* drop old cap expires; we're about to reestablish that state */
- cleanup_cap_releases(mdsc, session);
+ detach_cap_releases(session, &dispose);
+ spin_unlock(&session->s_cap_lock);
+ dispose_cap_releases(mdsc, &dispose);
/* trim unused caps to reduce MDS's cache rejoin time */
if (mdsc->fsc->sb->s_root)
@@ -3857,14 +3887,14 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
goto err_out;
}
return;
+
bad:
pr_err("error decoding fsmap\n");
err_out:
mutex_lock(&mdsc->mutex);
- mdsc->mdsmap_err = -ENOENT;
+ mdsc->mdsmap_err = err;
__wake_requests(mdsc, &mdsc->waiting_for_map);
mutex_unlock(&mdsc->mutex);
- return;
}
/*
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e4082afedcb1..fe9fbb3f13f7 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -84,8 +84,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
- /* leave fsid little-endian, regardless of host endianness */
- fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
+ /* Must convert the fsid, for consistent values across arches */
+ fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^
+ le64_to_cpu(*((__le64 *)&monmap->fsid + 1));
buf->f_fsid.val[0] = fsid & 0xffffffff;
buf->f_fsid.val[1] = fsid >> 32;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3e27a28aa44a..2beeec07fa76 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -352,6 +352,7 @@ struct ceph_inode_info {
int i_pin_ref;
int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
+ atomic_t i_filelock_ref;
u32 i_shared_gen; /* increment each time we get FILE_SHARED */
u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
@@ -487,6 +488,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */
#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */
+
/*
* We set the ERROR_WRITE bit when we start seeing write errors on an inode
@@ -1011,7 +1014,6 @@ extern int ceph_encode_locks_to_buffer(struct inode *inode,
extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
struct ceph_pagelist *pagelist,
int num_fcntl_locks, int num_flock_locks);
-extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
/* debugfs.c */
extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index a37f003530d7..1175a1722411 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -447,8 +447,7 @@ int venus_fsync(struct super_block *sb, struct CodaFid *fid)
UPARG(CODA_FSYNC);
inp->coda_fsync.VFid = *fid;
- error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs),
- &outsize, inp);
+ error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
CODA_FREE(inp, insize);
return error;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index bd5d91e119ca..5fc5dc660600 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -54,8 +54,6 @@
#include <linux/if_tun.h>
#include <linux/ctype.h>
#include <linux/syscalls.h>
-#include <linux/i2c.h>
-#include <linux/i2c-dev.h>
#include <linux/atalk.h>
#include <linux/gfp.h>
#include <linux/cec.h>
@@ -137,22 +135,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return vfs_ioctl(file, cmd, arg);
}
-static int w_long(struct file *file,
- unsigned int cmd, compat_ulong_t __user *argp)
-{
- int err;
- unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
-
- if (valp == NULL)
- return -EFAULT;
- err = do_ioctl(file, cmd, (unsigned long)valp);
- if (err)
- return err;
- if (convert_in_user(valp, argp))
- return -EFAULT;
- return 0;
-}
-
struct compat_video_event {
int32_t type;
compat_time_t timestamp;
@@ -671,96 +653,6 @@ static int serial_struct_ioctl(struct file *file,
return err;
}
-/*
- * I2C layer ioctls
- */
-
-struct i2c_msg32 {
- u16 addr;
- u16 flags;
- u16 len;
- compat_caddr_t buf;
-};
-
-struct i2c_rdwr_ioctl_data32 {
- compat_caddr_t msgs; /* struct i2c_msg __user *msgs */
- u32 nmsgs;
-};
-
-struct i2c_smbus_ioctl_data32 {
- u8 read_write;
- u8 command;
- u32 size;
- compat_caddr_t data; /* union i2c_smbus_data *data */
-};
-
-struct i2c_rdwr_aligned {
- struct i2c_rdwr_ioctl_data cmd;
- struct i2c_msg msgs[0];
-};
-
-static int do_i2c_rdwr_ioctl(struct file *file,
- unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata)
-{
- struct i2c_rdwr_aligned __user *tdata;
- struct i2c_msg __user *tmsgs;
- struct i2c_msg32 __user *umsgs;
- compat_caddr_t datap;
- u32 nmsgs;
- int i;
-
- if (get_user(nmsgs, &udata->nmsgs))
- return -EFAULT;
- if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS)
- return -EINVAL;
-
- if (get_user(datap, &udata->msgs))
- return -EFAULT;
- umsgs = compat_ptr(datap);
-
- tdata = compat_alloc_user_space(sizeof(*tdata) +
- nmsgs * sizeof(struct i2c_msg));
- tmsgs = &tdata->msgs[0];
-
- if (put_user(nmsgs, &tdata->cmd.nmsgs) ||
- put_user(tmsgs, &tdata->cmd.msgs))
- return -EFAULT;
-
- for (i = 0; i < nmsgs; i++) {
- if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16)))
- return -EFAULT;
- if (get_user(datap, &umsgs[i].buf) ||
- put_user(compat_ptr(datap), &tmsgs[i].buf))
- return -EFAULT;
- }
- return do_ioctl(file, cmd, (unsigned long)tdata);
-}
-
-static int do_i2c_smbus_ioctl(struct file *file,
- unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata)
-{
- struct i2c_smbus_ioctl_data __user *tdata;
- union {
- /* beginnings of those have identical layouts */
- struct i2c_smbus_ioctl_data32 data32;
- struct i2c_smbus_ioctl_data data;
- } v;
-
- tdata = compat_alloc_user_space(sizeof(*tdata));
- if (tdata == NULL)
- return -ENOMEM;
-
- memset(&v, 0, sizeof(v));
- if (copy_from_user(&v.data32, udata, sizeof(v.data32)))
- return -EFAULT;
- v.data.data = compat_ptr(v.data32.data);
-
- if (copy_to_user(tdata, &v.data, sizeof(v.data)))
- return -EFAULT;
-
- return do_ioctl(file, cmd, (unsigned long)tdata);
-}
-
#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t)
#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
@@ -1283,13 +1175,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* i2c */
-COMPATIBLE_IOCTL(I2C_SLAVE)
-COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
-COMPATIBLE_IOCTL(I2C_TENBIT)
-COMPATIBLE_IOCTL(I2C_PEC)
-COMPATIBLE_IOCTL(I2C_RETRIES)
-COMPATIBLE_IOCTL(I2C_TIMEOUT)
/* hiddev */
COMPATIBLE_IOCTL(HIDIOCGVERSION)
COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
@@ -1464,13 +1349,6 @@ static long do_ioctl_trans(unsigned int cmd,
case TIOCGSERIAL:
case TIOCSSERIAL:
return serial_struct_ioctl(file, cmd, argp);
- /* i2c */
- case I2C_FUNCS:
- return w_long(file, cmd, argp);
- case I2C_RDWR:
- return do_i2c_rdwr_ioctl(file, cmd, argp);
- case I2C_SMBUS:
- return do_i2c_smbus_ioctl(file, cmd, argp);
/* Not implemented in the native kernel */
case RTC_IRQP_READ32:
case RTC_IRQP_SET32:
@@ -1580,6 +1458,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
case FICLONE:
case FICLONERANGE:
case FIDEDUPERANGE:
+ case FS_IOC_FIEMAP:
goto do_ioctl;
case FIBMAP:
diff --git a/fs/coredump.c b/fs/coredump.c
index 52c63d6c9143..1e2c87acac9b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -680,16 +680,11 @@ void do_coredump(const siginfo_t *siginfo)
* privs and don't want to unlink another user's coredump.
*/
if (!need_suid_safe) {
- mm_segment_t old_fs;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
/*
* If it doesn't exist, that's fine. If there's some
* other problem, we'll catch it at the filp_open().
*/
- (void) sys_unlink((const char __user *)cn.corename);
- set_fs(old_fs);
+ do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
}
/*
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
index 11b29d491b7c..f937082f3244 100644
--- a/fs/cramfs/Kconfig
+++ b/fs/cramfs/Kconfig
@@ -1,6 +1,5 @@
config CRAMFS
- tristate "Compressed ROM file system support (cramfs) (OBSOLETE)"
- depends on BLOCK
+ tristate "Compressed ROM file system support (cramfs)"
select ZLIB_INFLATE
help
Saying Y here includes support for CramFs (Compressed ROM File
@@ -16,7 +15,39 @@ config CRAMFS
cramfs. Note that the root file system (the one containing the
directory /) cannot be compiled as a module.
- This filesystem is obsoleted by SquashFS, which is much better
- in terms of performance and features.
+ This filesystem is limited in capabilities and performance on
+ purpose to remain small and low on RAM usage. It is most suitable
+ for small embedded systems. If you have ample RAM to spare, you may
+ consider a more capable compressed filesystem such as SquashFS
+ which is much better in terms of performance and features.
+
+ If unsure, say N.
+
+config CRAMFS_BLOCKDEV
+ bool "Support CramFs image over a regular block device" if EXPERT
+ depends on CRAMFS && BLOCK
+ default y
+ help
+ This option allows the CramFs driver to load data from a regular
+ block device such as a disk partition or a ramdisk.
+
+config CRAMFS_MTD
+ bool "Support CramFs image directly mapped in physical memory"
+ depends on CRAMFS && MTD
+ default y if !CRAMFS_BLOCKDEV
+ help
+ This option allows the CramFs driver to load data directly from
+ a linearly addressed memory range (usually non-volatile memory
+ like flash) instead of going through the block device layer.
+ This saves some memory since no intermediate buffering is
+ necessary.
+
+ The location of the CramFs image is determined by a
+ MTD device capable of direct memory mapping e.g. from
+ the 'physmap' map driver or a resulting MTD partition.
+ For example, this would mount the cramfs image stored in
+ the MTD partition named "xip_fs" on the /mnt mountpoint:
+
+ mount -t cramfs mtd:xip_fs /mnt
If unsure, say N.
diff --git a/fs/cramfs/README b/fs/cramfs/README
index 9d4e7ea311f4..d71b27e0ff15 100644
--- a/fs/cramfs/README
+++ b/fs/cramfs/README
@@ -49,17 +49,46 @@ same as the start of the (i+1)'th <block> if there is one). The first
<block> immediately follows the last <block_pointer> for the file.
<block_pointer>s are each 32 bits long.
+When the CRAMFS_FLAG_EXT_BLOCK_POINTERS capability bit is set, each
+<block_pointer>'s top bits may contain special flags as follows:
+
+CRAMFS_BLK_FLAG_UNCOMPRESSED (bit 31):
+ The block data is not compressed and should be copied verbatim.
+
+CRAMFS_BLK_FLAG_DIRECT_PTR (bit 30):
+ The <block_pointer> stores the actual block start offset and not
+ its end, shifted right by 2 bits. The block must therefore be
+ aligned to a 4-byte boundary. The block size is blksize
+ if CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified; otherwise
+ the compressed data length is included in the first 2 bytes of
+ the block data. This is used to allow discontiguous data layout
+ and specific data block alignments e.g. for XIP applications.
+
+
The order of <file_data>'s is a depth-first descent of the directory
tree, i.e. the same order as `find -size +0 \( -type f -o -type l \)
-print'.
<block>: The i'th <block> is the output of zlib's compress function
-applied to the i'th blksize-sized chunk of the input data.
+applied to the i'th blksize-sized chunk of the input data if the
+corresponding CRAMFS_BLK_FLAG_UNCOMPRESSED <block_pointer> bit is not set,
+otherwise it is the input data directly.
(For the last <block> of the file, the input may of course be smaller.)
Each <block> may be a different size. (See <block_pointer> above.)
+
<block>s are merely byte-aligned, not generally u32-aligned.
+When CRAMFS_BLK_FLAG_DIRECT_PTR is specified then the corresponding
+<block> may be located anywhere and not necessarily contiguous with
+the previous/next blocks. In that case it is minimally u32-aligned.
+If CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified then the size is always
+blksize except for the last block which is limited by the file length.
+If CRAMFS_BLK_FLAG_DIRECT_PTR is set and CRAMFS_BLK_FLAG_UNCOMPRESSED
+is not set then the first 2 bytes of the block contain the size of the
+remaining block data as this cannot be determined from the placement of
+logically adjacent blocks.
+
Holes
-----
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 7919967488cb..9a2ab419ba62 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -15,10 +15,15 @@
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/file.h>
#include <linux/pagemap.h>
+#include <linux/pfn_t.h>
+#include <linux/ramfs.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/blkdev.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/super.h>
#include <linux/slab.h>
#include <linux/vfs.h>
#include <linux/mutex.h>
@@ -36,6 +41,9 @@ struct cramfs_sb_info {
unsigned long blocks;
unsigned long files;
unsigned long flags;
+ void *linear_virt_addr;
+ resource_size_t linear_phys_addr;
+ size_t mtd_point_size;
};
static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
@@ -46,6 +54,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
static const struct super_operations cramfs_ops;
static const struct inode_operations cramfs_dir_inode_operations;
static const struct file_operations cramfs_directory_operations;
+static const struct file_operations cramfs_physmem_fops;
static const struct address_space_operations cramfs_aops;
static DEFINE_MUTEX(read_mutex);
@@ -93,6 +102,10 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
case S_IFREG:
inode->i_fop = &generic_ro_fops;
inode->i_data.a_ops = &cramfs_aops;
+ if (IS_ENABLED(CONFIG_CRAMFS_MTD) &&
+ CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS &&
+ CRAMFS_SB(sb)->linear_phys_addr)
+ inode->i_fop = &cramfs_physmem_fops;
break;
case S_IFDIR:
inode->i_op = &cramfs_dir_inode_operations;
@@ -140,6 +153,9 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
* BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to
* worry about end-of-buffer issues even when decompressing a full
* page cache.
+ *
+ * Note: This is all optimized away at compile time when
+ * CONFIG_CRAMFS_BLOCKDEV=n.
*/
#define READ_BUFFERS (2)
/* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */
@@ -160,10 +176,10 @@ static struct super_block *buffer_dev[READ_BUFFERS];
static int next_buffer;
/*
- * Returns a pointer to a buffer containing at least LEN bytes of
- * filesystem starting at byte offset OFFSET into the filesystem.
+ * Populate our block cache and return a pointer to it.
*/
-static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len)
+static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
+ unsigned int len)
{
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
struct page *pages[BLKS_PER_BUF];
@@ -239,11 +255,250 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
return read_buffers[buffer] + offset;
}
+/*
+ * Return a pointer to the linearly addressed cramfs image in memory.
+ */
+static void *cramfs_direct_read(struct super_block *sb, unsigned int offset,
+ unsigned int len)
+{
+ struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+
+ if (!len)
+ return NULL;
+ if (len > sbi->size || offset > sbi->size - len)
+ return page_address(ZERO_PAGE(0));
+ return sbi->linear_virt_addr + offset;
+}
+
+/*
+ * Returns a pointer to a buffer containing at least LEN bytes of
+ * filesystem starting at byte offset OFFSET into the filesystem.
+ */
+static void *cramfs_read(struct super_block *sb, unsigned int offset,
+ unsigned int len)
+{
+ struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+
+ if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sbi->linear_virt_addr)
+ return cramfs_direct_read(sb, offset, len);
+ else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV))
+ return cramfs_blkdev_read(sb, offset, len);
+ else
+ return NULL;
+}
+
+/*
+ * For a mapping to be possible, we need a range of uncompressed and
+ * contiguous blocks. Return the offset for the first block and number of
+ * valid blocks for which that is true, or zero otherwise.
+ */
+static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages)
+{
+ struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb);
+ int i;
+ u32 *blockptrs, first_block_addr;
+
+ /*
+ * We can dereference memory directly here as this code may be
+ * reached only when there is a direct filesystem image mapping
+ * available in memory.
+ */
+ blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4);
+ first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS;
+ i = 0;
+ do {
+ u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT);
+ u32 expect = (first_block_addr + block_off) |
+ CRAMFS_BLK_FLAG_DIRECT_PTR |
+ CRAMFS_BLK_FLAG_UNCOMPRESSED;
+ if (blockptrs[i] != expect) {
+ pr_debug("range: block %d/%d got %#x expects %#x\n",
+ pgoff+i, pgoff + *pages - 1,
+ blockptrs[i], expect);
+ if (i == 0)
+ return 0;
+ break;
+ }
+ } while (++i < *pages);
+
+ *pages = i;
+ return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT;
+}
+
+#ifdef CONFIG_MMU
+
+/*
+ * Return true if the last page of a file in the filesystem image contains
+ * some other data that doesn't belong to that file. It is assumed that the
+ * last block is CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED
+ * (verified by cramfs_get_block_range()) and directly accessible in memory.
+ */
+static bool cramfs_last_page_is_shared(struct inode *inode)
+{
+ struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb);
+ u32 partial, last_page, blockaddr, *blockptrs;
+ char *tail_data;
+
+ partial = offset_in_page(inode->i_size);
+ if (!partial)
+ return false;
+ last_page = inode->i_size >> PAGE_SHIFT;
+ blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode));
+ blockaddr = blockptrs[last_page] & ~CRAMFS_BLK_FLAGS;
+ blockaddr <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+ tail_data = sbi->linear_virt_addr + blockaddr + partial;
+ return memchr_inv(tail_data, 0, PAGE_SIZE - partial) ? true : false;
+}
+
+static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(file);
+ struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb);
+ unsigned int pages, max_pages, offset;
+ unsigned long address, pgoff = vma->vm_pgoff;
+ char *bailout_reason;
+ int ret;
+
+ ret = generic_file_readonly_mmap(file, vma);
+ if (ret)
+ return ret;
+
+ /*
+ * Now try to pre-populate ptes for this vma with a direct
+ * mapping avoiding memory allocation when possible.
+ */
+
+ /* Could COW work here? */
+ bailout_reason = "vma is writable";
+ if (vma->vm_flags & VM_WRITE)
+ goto bailout;
+
+ max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ bailout_reason = "beyond file limit";
+ if (pgoff >= max_pages)
+ goto bailout;
+ pages = min(vma_pages(vma), max_pages - pgoff);
+
+ offset = cramfs_get_block_range(inode, pgoff, &pages);
+ bailout_reason = "unsuitable block layout";
+ if (!offset)
+ goto bailout;
+ address = sbi->linear_phys_addr + offset;
+ bailout_reason = "data is not page aligned";
+ if (!PAGE_ALIGNED(address))
+ goto bailout;
+
+ /* Don't map the last page if it contains some other data */
+ if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) {
+ pr_debug("mmap: %s: last page is shared\n",
+ file_dentry(file)->d_name.name);
+ pages--;
+ }
+
+ if (!pages) {
+ bailout_reason = "no suitable block remaining";
+ goto bailout;
+ }
+
+ if (pages == vma_pages(vma)) {
+ /*
+ * The entire vma is mappable. remap_pfn_range() will
+ * make it distinguishable from a non-direct mapping
+ * in /proc/<pid>/maps by substituting the file offset
+ * with the actual physical address.
+ */
+ ret = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT,
+ pages * PAGE_SIZE, vma->vm_page_prot);
+ } else {
+ /*
+ * Let's create a mixed map if we can't map it all.
+ * The normal paging machinery will take care of the
+ * unpopulated ptes via cramfs_readpage().
+ */
+ int i;
+ vma->vm_flags |= VM_MIXEDMAP;
+ for (i = 0; i < pages && !ret; i++) {
+ unsigned long off = i * PAGE_SIZE;
+ pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
+ ret = vm_insert_mixed(vma, vma->vm_start + off, pfn);
+ }
+ }
+
+ if (!ret)
+ pr_debug("mapped %s[%lu] at 0x%08lx (%u/%lu pages) "
+ "to vma 0x%08lx, page_prot 0x%llx\n",
+ file_dentry(file)->d_name.name, pgoff,
+ address, pages, vma_pages(vma), vma->vm_start,
+ (unsigned long long)pgprot_val(vma->vm_page_prot));
+ return ret;
+
+bailout:
+ pr_debug("%s[%lu]: direct mmap impossible: %s\n",
+ file_dentry(file)->d_name.name, pgoff, bailout_reason);
+ /* Didn't manage any direct map, but normal paging is still possible */
+ return 0;
+}
+
+#else /* CONFIG_MMU */
+
+static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
+}
+
+static unsigned long cramfs_physmem_get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+ struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+ unsigned int pages, block_pages, max_pages, offset;
+
+ pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= max_pages || pages > max_pages - pgoff)
+ return -EINVAL;
+ block_pages = pages;
+ offset = cramfs_get_block_range(inode, pgoff, &block_pages);
+ if (!offset || block_pages != pages)
+ return -ENOSYS;
+ addr = sbi->linear_phys_addr + offset;
+ pr_debug("get_unmapped for %s ofs %#lx siz %lu at 0x%08lx\n",
+ file_dentry(file)->d_name.name, pgoff*PAGE_SIZE, len, addr);
+ return addr;
+}
+
+static unsigned int cramfs_physmem_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT |
+ NOMMU_MAP_READ | NOMMU_MAP_EXEC;
+}
+
+#endif /* CONFIG_MMU */
+
+static const struct file_operations cramfs_physmem_fops = {
+ .llseek = generic_file_llseek,
+ .read_iter = generic_file_read_iter,
+ .splice_read = generic_file_splice_read,
+ .mmap = cramfs_physmem_mmap,
+#ifndef CONFIG_MMU
+ .get_unmapped_area = cramfs_physmem_get_unmapped_area,
+ .mmap_capabilities = cramfs_physmem_mmap_capabilities,
+#endif
+};
+
static void cramfs_kill_sb(struct super_block *sb)
{
struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
- kill_block_super(sb);
+ /*
+ * Tear down whichever backing store this superblock was mounted on.
+ * An MTD mount first releases the mtd_point() mapping (if one is
+ * recorded in mtd_point_size) and then calls kill_mtd_super(); a
+ * block device mount calls kill_block_super(). The private info is
+ * freed last in every case.
+ *
+ * The config symbol must be spelled CONFIG_CRAMFS_MTD: IS_ENABLED()
+ * on an undefined symbol silently evaluates to 0, which would skip
+ * the MTD teardown entirely.
+ */
+ if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) {
+ if (sbi && sbi->mtd_point_size)
+ mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size);
+ kill_mtd_super(sb);
+ } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
+ kill_block_super(sb);
+ }
kfree(sbi);
}
@@ -254,34 +509,24 @@ static int cramfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
-static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
+static int cramfs_read_super(struct super_block *sb,
+ struct cramfs_super *super, int silent)
{
- int i;
- struct cramfs_super super;
+ struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
unsigned long root_offset;
- struct cramfs_sb_info *sbi;
- struct inode *root;
- sb->s_flags |= MS_RDONLY;
-
- sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
- sb->s_fs_info = sbi;
-
- /* Invalidate the read buffers on mount: think disk change.. */
- mutex_lock(&read_mutex);
- for (i = 0; i < READ_BUFFERS; i++)
- buffer_blocknr[i] = -1;
+ /* We don't know the real size yet */
+ sbi->size = PAGE_SIZE;
/* Read the first block and get the superblock from it */
- memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super));
+ mutex_lock(&read_mutex);
+ memcpy(super, cramfs_read(sb, 0, sizeof(*super)), sizeof(*super));
mutex_unlock(&read_mutex);
/* Do sanity checks on the superblock */
- if (super.magic != CRAMFS_MAGIC) {
+ if (super->magic != CRAMFS_MAGIC) {
/* check for wrong endianness */
- if (super.magic == CRAMFS_MAGIC_WEND) {
+ if (super->magic == CRAMFS_MAGIC_WEND) {
if (!silent)
pr_err("wrong endianness\n");
return -EINVAL;
@@ -289,10 +534,12 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
/* check at 512 byte offset */
mutex_lock(&read_mutex);
- memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super));
+ memcpy(super,
+ cramfs_read(sb, 512, sizeof(*super)),
+ sizeof(*super));
mutex_unlock(&read_mutex);
- if (super.magic != CRAMFS_MAGIC) {
- if (super.magic == CRAMFS_MAGIC_WEND && !silent)
+ if (super->magic != CRAMFS_MAGIC) {
+ if (super->magic == CRAMFS_MAGIC_WEND && !silent)
pr_err("wrong endianness\n");
else if (!silent)
pr_err("wrong magic\n");
@@ -301,34 +548,34 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
}
/* get feature flags first */
- if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
+ if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) {
pr_err("unsupported filesystem features\n");
return -EINVAL;
}
/* Check that the root inode is in a sane state */
- if (!S_ISDIR(super.root.mode)) {
+ if (!S_ISDIR(super->root.mode)) {
pr_err("root is not a directory\n");
return -EINVAL;
}
/* correct strange, hard-coded permissions of mkcramfs */
- super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+ super->root.mode |= 0555;
- root_offset = super.root.offset << 2;
- if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
- sbi->size = super.size;
- sbi->blocks = super.fsid.blocks;
- sbi->files = super.fsid.files;
+ root_offset = super->root.offset << 2;
+ if (super->flags & CRAMFS_FLAG_FSID_VERSION_2) {
+ sbi->size = super->size;
+ sbi->blocks = super->fsid.blocks;
+ sbi->files = super->fsid.files;
} else {
sbi->size = 1<<28;
sbi->blocks = 0;
sbi->files = 0;
}
- sbi->magic = super.magic;
- sbi->flags = super.flags;
+ sbi->magic = super->magic;
+ sbi->flags = super->flags;
if (root_offset == 0)
pr_info("empty filesystem");
- else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
+ else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
((root_offset != sizeof(struct cramfs_super)) &&
(root_offset != 512 + sizeof(struct cramfs_super))))
{
@@ -336,9 +583,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
return -EINVAL;
}
+ return 0;
+}
+
+static int cramfs_finalize_super(struct super_block *sb,
+ struct cramfs_inode *cramfs_root)
+{
+ struct inode *root;
+
/* Set it all up.. */
+ sb->s_flags |= MS_RDONLY;
sb->s_op = &cramfs_ops;
- root = get_cramfs_inode(sb, &super.root, 0);
+ root = get_cramfs_inode(sb, cramfs_root, 0);
if (IS_ERR(root))
return PTR_ERR(root);
sb->s_root = d_make_root(root);
@@ -347,10 +603,79 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
return 0;
}
+static int cramfs_blkdev_fill_super(struct super_block *sb, void *data,
+ int silent)
+{
+ struct cramfs_sb_info *sbi;
+ struct cramfs_super super;
+ int i, err;
+
+ sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+ sb->s_fs_info = sbi;
+
+ /* Invalidate the read buffers on mount: think disk change.. */
+ for (i = 0; i < READ_BUFFERS; i++)
+ buffer_blocknr[i] = -1;
+
+ err = cramfs_read_super(sb, &super, silent);
+ if (err)
+ return err;
+ return cramfs_finalize_super(sb, &super.root);
+}
+
+static int cramfs_mtd_fill_super(struct super_block *sb, void *data,
+ int silent)
+{
+ struct cramfs_sb_info *sbi;
+ struct cramfs_super super;
+ int err;
+
+ sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+ sb->s_fs_info = sbi;
+
+ /* Map only one page for now. Will remap it when fs size is known. */
+ err = mtd_point(sb->s_mtd, 0, PAGE_SIZE, &sbi->mtd_point_size,
+ &sbi->linear_virt_addr, &sbi->linear_phys_addr);
+ if (err || sbi->mtd_point_size != PAGE_SIZE) {
+ pr_err("unable to get direct memory access to mtd:%s\n",
+ sb->s_mtd->name);
+ return err ? : -ENODATA;
+ }
+
+ pr_info("checking physical address %pap for linear cramfs image\n",
+ &sbi->linear_phys_addr);
+ err = cramfs_read_super(sb, &super, silent);
+ if (err)
+ return err;
+
+ /* Remap the whole filesystem now */
+ pr_info("linear cramfs image on mtd:%s appears to be %lu KB in size\n",
+ sb->s_mtd->name, sbi->size/1024);
+ mtd_unpoint(sb->s_mtd, 0, PAGE_SIZE);
+ err = mtd_point(sb->s_mtd, 0, sbi->size, &sbi->mtd_point_size,
+ &sbi->linear_virt_addr, &sbi->linear_phys_addr);
+ if (err || sbi->mtd_point_size != sbi->size) {
+ pr_err("unable to get direct memory access to mtd:%s\n",
+ sb->s_mtd->name);
+ return err ? : -ENODATA;
+ }
+
+ return cramfs_finalize_super(sb, &super.root);
+}
+
static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ u64 id = 0;
+
+ if (sb->s_bdev)
+ id = huge_encode_dev(sb->s_bdev->bd_dev);
+ else if (sb->s_dev)
+ id = huge_encode_dev(sb->s_dev);
buf->f_type = CRAMFS_MAGIC;
buf->f_bsize = PAGE_SIZE;
@@ -502,34 +827,86 @@ static int cramfs_readpage(struct file *file, struct page *page)
if (page->index < maxblock) {
struct super_block *sb = inode->i_sb;
- u32 blkptr_offset = OFFSET(inode) + page->index*4;
- u32 start_offset, compr_len;
+ u32 blkptr_offset = OFFSET(inode) + page->index * 4;
+ u32 block_ptr, block_start, block_len;
+ bool uncompressed, direct;
- start_offset = OFFSET(inode) + maxblock*4;
mutex_lock(&read_mutex);
- if (page->index)
- start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
- 4);
- compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
- start_offset);
- mutex_unlock(&read_mutex);
+ block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4);
+ uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED);
+ direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR);
+ block_ptr &= ~CRAMFS_BLK_FLAGS;
+
+ if (direct) {
+ /*
+ * The block pointer is an absolute start pointer,
+ * shifted by 2 bits. The size is included in the
+ * first 2 bytes of the data block when compressed,
+ * or PAGE_SIZE otherwise.
+ */
+ block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT;
+ if (uncompressed) {
+ block_len = PAGE_SIZE;
+ /* if last block: cap to file length */
+ if (page->index == maxblock - 1)
+ block_len =
+ offset_in_page(inode->i_size);
+ } else {
+ block_len = *(u16 *)
+ cramfs_read(sb, block_start, 2);
+ block_start += 2;
+ }
+ } else {
+ /*
+ * The block pointer indicates one past the end of
+ * the current block (start of next block). If this
+ * is the first block then it starts where the block
+ * pointer table ends, otherwise its start comes
+ * from the previous block's pointer.
+ */
+ block_start = OFFSET(inode) + maxblock * 4;
+ if (page->index)
+ block_start = *(u32 *)
+ cramfs_read(sb, blkptr_offset - 4, 4);
+ /* Beware... previous ptr might be a direct ptr */
+ if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) {
+ /* See comments on earlier code. */
+ u32 prev_start = block_start;
+ block_start = prev_start & ~CRAMFS_BLK_FLAGS;
+ block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+ if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) {
+ block_start += PAGE_SIZE;
+ } else {
+ block_len = *(u16 *)
+ cramfs_read(sb, block_start, 2);
+ block_start += 2 + block_len;
+ }
+ }
+ block_start &= ~CRAMFS_BLK_FLAGS;
+ block_len = block_ptr - block_start;
+ }
- if (compr_len == 0)
+ if (block_len == 0)
; /* hole */
- else if (unlikely(compr_len > (PAGE_SIZE << 1))) {
- pr_err("bad compressed blocksize %u\n",
- compr_len);
+ else if (unlikely(block_len > 2*PAGE_SIZE ||
+ (uncompressed && block_len > PAGE_SIZE))) {
+ mutex_unlock(&read_mutex);
+ pr_err("bad data blocksize %u\n", block_len);
goto err;
+ } else if (uncompressed) {
+ memcpy(pgdata,
+ cramfs_read(sb, block_start, block_len),
+ block_len);
+ bytes_filled = block_len;
} else {
- mutex_lock(&read_mutex);
bytes_filled = cramfs_uncompress_block(pgdata,
PAGE_SIZE,
- cramfs_read(sb, start_offset, compr_len),
- compr_len);
- mutex_unlock(&read_mutex);
- if (unlikely(bytes_filled < 0))
- goto err;
+ cramfs_read(sb, block_start, block_len),
+ block_len);
}
+ mutex_unlock(&read_mutex);
+ if (unlikely(bytes_filled < 0))
+ goto err;
}
memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
@@ -573,10 +950,22 @@ static const struct super_operations cramfs_ops = {
.statfs = cramfs_statfs,
};
-static struct dentry *cramfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static struct dentry *cramfs_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
{
- return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
+ struct dentry *ret = ERR_PTR(-ENOPROTOOPT);
+
+ if (IS_ENABLED(CONFIG_CRAMFS_MTD)) {
+ ret = mount_mtd(fs_type, flags, dev_name, data,
+ cramfs_mtd_fill_super);
+ if (!IS_ERR(ret))
+ return ret;
+ }
+ if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) {
+ ret = mount_bdev(fs_type, flags, dev_name, data,
+ cramfs_blkdev_fill_super);
+ }
+ return ret;
}
static struct file_system_type cramfs_fs_type = {
diff --git a/fs/dax.c b/fs/dax.c
index 3652b26a0048..95981591977a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
static void *dax_insert_mapping_entry(struct address_space *mapping,
struct vm_fault *vmf,
void *entry, sector_t sector,
- unsigned long flags)
+ unsigned long flags, bool dirty)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
void *new_entry;
pgoff_t index = vmf->pgoff;
- if (vmf->flags & FAULT_FLAG_WRITE)
+ if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
entry = new_entry;
}
- if (vmf->flags & FAULT_FLAG_WRITE)
+ if (dirty)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
@@ -825,38 +825,42 @@ out:
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static int dax_insert_mapping(struct address_space *mapping,
- struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, size_t size, void *entry,
- struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
- unsigned long vaddr = vmf->address;
- void *ret, *kaddr;
+ return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+}
+
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+ pfn_t *pfnp)
+{
+ const sector_t sector = dax_iomap_sector(iomap, pos);
pgoff_t pgoff;
+ void *kaddr;
int id, rc;
- pfn_t pfn;
+ long length;
- rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
if (rc)
return rc;
-
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
+ length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
+ &kaddr, pfnp);
+ if (length < 0) {
+ rc = length;
+ goto out;
}
+ rc = -EINVAL;
+ if (PFN_PHYS(length) < size)
+ goto out;
+ if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+ goto out;
+ /* For larger pages we need devmap */
+ if (length > 1 && !pfn_t_devmap(*pfnp))
+ goto out;
+ rc = 0;
+out:
dax_read_unlock(id);
-
- ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
- if (IS_ERR(ret))
- return PTR_ERR(ret);
-
- trace_dax_insert_mapping(mapping->host, vmf, ret);
- if (vmf->flags & FAULT_FLAG_WRITE)
- return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
- else
- return vm_insert_mixed(vma, vaddr, pfn);
+ return rc;
}
/*
@@ -882,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
}
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
- RADIX_DAX_ZERO_PAGE);
+ RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(entry2)) {
ret = VM_FAULT_SIGBUS;
goto out;
@@ -941,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
- return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
@@ -1085,19 +1084,33 @@ static int dax_fault_return(int error)
return VM_FAULT_SIGBUS;
}
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+ struct vm_area_struct *vma, struct iomap *iomap)
+{
+ return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+ && (iomap->flags & IOMAP_F_DIRTY);
+}
+
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ struct vm_area_struct *vma = vmf->vma;
+ struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
- sector_t sector;
struct iomap iomap = { 0 };
unsigned flags = IOMAP_FAULT;
int error, major = 0;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool sync;
int vmf_ret = 0;
void *entry;
+ pfn_t pfn;
trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
@@ -1110,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto out;
}
- if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+ if (write && !vmf->cow_page)
flags |= IOMAP_WRITE;
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1145,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto error_finish_iomap;
}
- sector = dax_iomap_sector(&iomap, pos);
-
if (vmf->cow_page) {
+ sector_t sector = dax_iomap_sector(&iomap, pos);
+
switch (iomap.type) {
case IOMAP_HOLE:
case IOMAP_UNWRITTEN:
@@ -1173,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto finish_iomap;
}
+ sync = dax_fault_is_synchronous(flags, vma, &iomap);
+
switch (iomap.type) {
case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
- error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
- sector, PAGE_SIZE, entry, vmf->vma, vmf);
+ error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+ if (error < 0)
+ goto error_finish_iomap;
+
+ entry = dax_insert_mapping_entry(mapping, vmf, entry,
+ dax_iomap_sector(&iomap, pos),
+ 0, write && !sync);
+ if (IS_ERR(entry)) {
+ error = PTR_ERR(entry);
+ goto error_finish_iomap;
+ }
+
+ /*
+ * If we are doing synchronous page fault and inode needs fsync,
+ * we can insert PTE into page tables only after that happens.
+ * Skip insertion for now and return the pfn so that caller can
+ * insert it after fsync is done.
+ */
+ if (sync) {
+ if (WARN_ON_ONCE(!pfnp)) {
+ error = -EIO;
+ goto error_finish_iomap;
+ }
+ *pfnp = pfn;
+ vmf_ret = VM_FAULT_NEEDDSYNC | major;
+ goto finish_iomap;
+ }
+ trace_dax_insert_mapping(inode, vmf, entry);
+ if (write)
+ error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+ else
+ error = vm_insert_mixed(vma, vaddr, pfn);
+
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
- if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+ if (!write) {
vmf_ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
@@ -1223,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
}
#ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
- loff_t pos, void *entry)
-{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- const sector_t sector = dax_iomap_sector(iomap, pos);
- struct dax_device *dax_dev = iomap->dax_dev;
- struct block_device *bdev = iomap->bdev;
- struct inode *inode = mapping->host;
- const size_t size = PMD_SIZE;
- void *ret = NULL, *kaddr;
- long length = 0;
- pgoff_t pgoff;
- pfn_t pfn = {};
- int id;
-
- if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
- goto fallback;
-
- id = dax_read_lock();
- length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
- if (length < 0)
- goto unlock_fallback;
- length = PFN_PHYS(length);
-
- if (length < size)
- goto unlock_fallback;
- if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
- goto unlock_fallback;
- if (!pfn_t_devmap(pfn))
- goto unlock_fallback;
- dax_read_unlock(id);
-
- ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
- RADIX_DAX_PMD);
- if (IS_ERR(ret))
- goto fallback;
-
- trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
- return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
- pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-unlock_fallback:
- dax_read_unlock(id);
-fallback:
- trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
- return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
void *entry)
@@ -1288,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
goto fallback;
ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+ RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(ret))
goto fallback;
@@ -1310,13 +1314,14 @@ fallback:
return VM_FAULT_FALLBACK;
}
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool sync;
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
int result = VM_FAULT_FALLBACK;
@@ -1325,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
void *entry;
loff_t pos;
int error;
+ pfn_t pfn;
/*
* Check whether offset isn't beyond end of file now. Caller is
@@ -1332,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
* this is a reliable test.
*/
pgoff = linear_page_index(vma, pmd_addr);
- max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1356,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
- if (pgoff > max_pgoff) {
+ if (pgoff >= max_pgoff) {
result = VM_FAULT_SIGBUS;
goto out;
}
/* If the PMD would extend beyond the file size */
- if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+ if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
goto fallback;
/*
@@ -1400,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;
+ sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
+
switch (iomap.type) {
case IOMAP_MAPPED:
- result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+ error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+ if (error < 0)
+ goto finish_iomap;
+
+ entry = dax_insert_mapping_entry(mapping, vmf, entry,
+ dax_iomap_sector(&iomap, pos),
+ RADIX_DAX_PMD, write && !sync);
+ if (IS_ERR(entry))
+ goto finish_iomap;
+
+ /*
+ * If we are doing synchronous page fault and inode needs fsync,
+ * we can insert PMD into page tables only after that happens.
+ * Skip insertion for now and return the pfn so that caller can
+ * insert it after fsync is done.
+ */
+ if (sync) {
+ if (WARN_ON_ONCE(!pfnp))
+ goto finish_iomap;
+ *pfnp = pfn;
+ result = VM_FAULT_NEEDDSYNC;
+ goto finish_iomap;
+ }
+
+ trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+ result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+ write);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
@@ -1442,7 +1476,7 @@ out:
return result;
}
#else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
return VM_FAULT_FALLBACK;
@@ -1452,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
/**
* dax_iomap_fault - handle a page fault on a DAX file
* @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
+ * @ops: Iomap ops passed from the file system
*
* When a page fault occurs, filesystems may call this helper in
* their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1460,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
* successfully.
*/
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
- const struct iomap_ops *ops)
+ pfn_t *pfnp, const struct iomap_ops *ops)
{
switch (pe_size) {
case PE_SIZE_PTE:
- return dax_iomap_pte_fault(vmf, ops);
+ return dax_iomap_pte_fault(vmf, pfnp, ops);
case PE_SIZE_PMD:
- return dax_iomap_pmd_fault(vmf, ops);
+ return dax_iomap_pmd_fault(vmf, pfnp, ops);
default:
return VM_FAULT_FALLBACK;
}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts writeable PTE or PMD entry into page tables for mmaped
+ * DAX file. It takes care of marking corresponding radix tree entry as dirty
+ * as well.
+ *
+ * The radix tree entry at @vmf->pgoff is looked up under mapping->tree_lock.
+ * If no entry is found, or the entry type does not match @pe_size, nothing
+ * is inserted and VM_FAULT_NOPAGE is returned. Otherwise the index is tagged
+ * PAGECACHE_TAG_DIRTY and the slot is locked before tree_lock is dropped; the
+ * entry is released again with put_locked_mapping_entry() once the page
+ * table insertion has been attempted.
+ *
+ * Return: VM_FAULT_NOPAGE if the entry was missing or of the wrong type; for
+ * PE_SIZE_PTE, the value dax_fault_return() derives from the error code of
+ * vm_insert_mixed_mkwrite(); for PE_SIZE_PMD (only when CONFIG_FS_DAX_PMD is
+ * defined), the result of vmf_insert_pfn_pmd(); VM_FAULT_FALLBACK for any
+ * other size.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ pfn_t pfn)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ void *entry, **slot;
+ pgoff_t index = vmf->pgoff;
+ int vmf_ret, error;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ /* Did we race with someone splitting entry or so? */
+ if (!entry ||
+ (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+ (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+ put_unlocked_mapping_entry(mapping, index, entry);
+ spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+ VM_FAULT_NOPAGE);
+ return VM_FAULT_NOPAGE;
+ }
+ radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+ entry = lock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ switch (pe_size) {
+ case PE_SIZE_PTE:
+ error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+ vmf_ret = dax_fault_return(error);
+ break;
+#ifdef CONFIG_FS_DAX_PMD
+ case PE_SIZE_PMD:
+ vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+ pfn, true); /* true sits where the PMD fault path passes its 'write' flag */
+ break;
+#endif
+ default:
+ vmf_ret = VM_FAULT_FALLBACK;
+ }
+ put_locked_mapping_entry(mapping, index);
+ trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+ return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and handles inserting of appropriate page
+ * table entry.
+ *
+ * The range passed to vfs_fsync_range() starts at @vmf->pgoff << PAGE_SHIFT
+ * and is PAGE_SIZE long for PE_SIZE_PTE or PMD_SIZE long for PE_SIZE_PMD.
+ * Any other @pe_size triggers a one-time warning and leaves the length at 0.
+ * NOTE(review): in that case the end offset passed down is start - 1; confirm
+ * this is acceptable to vfs_fsync_range().
+ *
+ * Return: VM_FAULT_SIGBUS if vfs_fsync_range() reports an error, otherwise
+ * the result of dax_insert_pfn_mkwrite().
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+ pfn_t pfn)
+{
+ int err;
+ loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+ size_t len = 0;
+
+ if (pe_size == PE_SIZE_PTE)
+ len = PAGE_SIZE;
+ else if (pe_size == PE_SIZE_PMD)
+ len = PMD_SIZE;
+ else
+ WARN_ON_ONCE(1);
+ err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); /* presumably end is inclusive and the final 1 requests datasync - confirm against vfs_fsync_range() docs */
+ if (err)
+ return VM_FAULT_SIGBUS;
+ return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index e5e29f8c920b..846ca150d52e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -36,27 +36,13 @@
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
+#include <linux/kernel.h>
#include "ecryptfs_kernel.h"
#define DECRYPT 0
#define ENCRYPT 1
/**
- * ecryptfs_to_hex
- * @dst: Buffer to take hex character representation of contents of
- * src; must be at least of size (src_size * 2)
- * @src: Buffer to be converted to a hex string representation
- * @src_size: number of bytes to convert
- */
-void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
-{
- int x;
-
- for (x = 0; x < src_size; x++)
- sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
-}
-
-/**
* ecryptfs_from_hex
* @dst: Buffer to take the bytes from src hex; must be at least of
* size (src_size / 2)
@@ -899,8 +885,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
u32 flags;
flags = get_unaligned_be32(page_virt);
- for (i = 0; i < ((sizeof(ecryptfs_flag_map)
- / sizeof(struct ecryptfs_flag_map_elem))); i++)
+ for (i = 0; i < ARRAY_SIZE(ecryptfs_flag_map); i++)
if (flags & ecryptfs_flag_map[i].file_flag) {
crypt_stat->flags |= ecryptfs_flag_map[i].local_flag;
} else
@@ -937,8 +922,7 @@ void ecryptfs_write_crypt_stat_flags(char *page_virt,
u32 flags = 0;
int i;
- for (i = 0; i < ((sizeof(ecryptfs_flag_map)
- / sizeof(struct ecryptfs_flag_map_elem))); i++)
+ for (i = 0; i < ARRAY_SIZE(ecryptfs_flag_map); i++)
if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag)
flags |= ecryptfs_flag_map[i].file_flag;
/* Version is in top 8 bits of the 32-bit flag vector */
@@ -1434,8 +1418,6 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER);
if (!page_virt) {
rc = -ENOMEM;
- printk(KERN_ERR "%s: Unable to allocate page_virt\n",
- __func__);
goto out;
}
rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size,
@@ -1522,9 +1504,6 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
filename->encrypted_filename =
kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
if (!filename->encrypted_filename) {
- printk(KERN_ERR "%s: Out of memory whilst attempting "
- "to kmalloc [%zd] bytes\n", __func__,
- filename->encrypted_filename_size);
rc = -ENOMEM;
goto out;
}
@@ -1669,12 +1648,10 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
BUG_ON(!mutex_is_locked(&key_tfm_list_mutex));
tmp_tfm = kmem_cache_alloc(ecryptfs_key_tfm_cache, GFP_KERNEL);
- if (key_tfm != NULL)
+ if (key_tfm)
(*key_tfm) = tmp_tfm;
if (!tmp_tfm) {
rc = -ENOMEM;
- printk(KERN_ERR "Error attempting to allocate from "
- "ecryptfs_key_tfm_cache\n");
goto out;
}
mutex_init(&tmp_tfm->key_tfm_mutex);
@@ -1690,7 +1667,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
"cipher with name = [%s]; rc = [%d]\n",
tmp_tfm->cipher_name, rc);
kmem_cache_free(ecryptfs_key_tfm_cache, tmp_tfm);
- if (key_tfm != NULL)
+ if (key_tfm)
(*key_tfm) = NULL;
goto out;
}
@@ -1881,7 +1858,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
size_t src_byte_offset = 0;
size_t dst_byte_offset = 0;
- if (dst == NULL) {
+ if (!dst) {
(*dst_size) = ecryptfs_max_decoded_size(src_size);
goto out;
}
@@ -1949,9 +1926,6 @@ int ecryptfs_encrypt_and_encode_filename(
filename = kzalloc(sizeof(*filename), GFP_KERNEL);
if (!filename) {
- printk(KERN_ERR "%s: Out of memory whilst attempting "
- "to kzalloc [%zd] bytes\n", __func__,
- sizeof(*filename));
rc = -ENOMEM;
goto out;
}
@@ -1980,9 +1954,6 @@ int ecryptfs_encrypt_and_encode_filename(
+ encoded_name_no_prefix_size);
(*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
if (!(*encoded_name)) {
- printk(KERN_ERR "%s: Out of memory whilst attempting "
- "to kzalloc [%zd] bytes\n", __func__,
- (*encoded_name_size));
rc = -ENOMEM;
kfree(filename->encrypted_filename);
kfree(filename);
@@ -2064,9 +2035,6 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
name, name_size);
decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
if (!decoded_name) {
- printk(KERN_ERR "%s: Out of memory whilst attempting "
- "to kmalloc [%zd] bytes\n", __func__,
- decoded_name_size);
rc = -ENOMEM;
goto out;
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 3fbc0ff79699..e74cb2a0b299 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -31,6 +31,7 @@
#include <crypto/skcipher.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
+#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/fs_stack.h>
#include <linux/namei.h>
@@ -51,7 +52,13 @@
#define ECRYPTFS_XATTR_NAME "user.ecryptfs"
void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok);
-extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size);
+static inline void
+ecryptfs_to_hex(char *dst, char *src, size_t src_size)
+{
+ char *end = bin2hex(dst, src, src_size); /* presumably points just past the hex digits written to dst - confirm against bin2hex() docs */
+ *end = '\0'; /* explicit NUL terminator; the removed sprintf() loop wrote one implicitly, so dst needs room for it */
+}
+
extern void ecryptfs_from_hex(char *dst, char *src, int dst_size);
struct ecryptfs_key_record {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index efc2db42d175..847904aa63a9 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -64,7 +64,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
/* i_size will be overwritten for encrypted regular files */
fsstack_copy_inode_size(inode, lower_inode);
inode->i_ino = lower_inode->i_ino;
- inode->i_version++;
inode->i_mapping->a_ops = &ecryptfs_aops;
if (S_ISLNK(inode->i_mode))
@@ -334,9 +333,6 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
if (!dentry_info) {
- printk(KERN_ERR "%s: Out of memory whilst attempting "
- "to allocate ecryptfs_dentry_info struct\n",
- __func__);
dput(lower_dentry);
return ERR_PTR(-ENOMEM);
}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index fa218cd64f74..c89a58cfc991 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -639,11 +639,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
int rc = 0;
s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s) {
- printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
- "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+ if (!s)
return -ENOMEM;
- }
+
(*packet_size) = 0;
rc = ecryptfs_find_auth_tok_for_sig(
&auth_tok_key,
@@ -687,7 +685,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
* separator, and then the filename */
s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE
+ s->block_aligned_filename_size);
- if (dest == NULL) {
+ if (!dest) {
(*packet_size) = s->max_packet_size;
goto out_unlock;
}
@@ -714,9 +712,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
GFP_KERNEL);
if (!s->block_aligned_filename) {
- printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
- "kzalloc [%zd] bytes\n", __func__,
- s->block_aligned_filename_size);
rc = -ENOMEM;
goto out_unlock;
}
@@ -769,10 +764,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
s->hash_desc = kmalloc(sizeof(*s->hash_desc) +
crypto_shash_descsize(s->hash_tfm), GFP_KERNEL);
if (!s->hash_desc) {
- printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
- "kmalloc [%zd] bytes\n", __func__,
- sizeof(*s->hash_desc) +
- crypto_shash_descsize(s->hash_tfm));
rc = -ENOMEM;
goto out_release_free_unlock;
}
@@ -925,11 +916,9 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
(*filename_size) = 0;
(*filename) = NULL;
s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s) {
- printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
- "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+ if (!s)
return -ENOMEM;
- }
+
if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
"at least [%d]\n", __func__, max_packet_size,
@@ -1015,9 +1004,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
GFP_KERNEL);
if (!s->decrypted_filename) {
- printk(KERN_ERR "%s: Out of memory whilst attempting to "
- "kmalloc [%zd] bytes\n", __func__,
- s->block_aligned_filename_size);
rc = -ENOMEM;
goto out_unlock;
}
@@ -1097,9 +1083,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
}
(*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
if (!(*filename)) {
- printk(KERN_ERR "%s: Out of memory whilst attempting to "
- "kmalloc [%zd] bytes\n", __func__,
- ((*filename_size) + 1));
rc = -ENOMEM;
goto out_free_unlock;
}
@@ -1333,7 +1316,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat,
if ((*new_auth_tok)->session_key.encrypted_key_size
> ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
printk(KERN_WARNING "Tag 1 packet contains key larger "
- "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES");
+ "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
rc = -EINVAL;
goto out;
}
@@ -2525,11 +2508,9 @@ int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
struct ecryptfs_key_sig *new_key_sig;
new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL);
- if (!new_key_sig) {
- printk(KERN_ERR
- "Error allocating from ecryptfs_key_sig_cache\n");
+ if (!new_key_sig)
return -ENOMEM;
- }
+
memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
/* Caller must hold keysig_list_mutex */
@@ -2545,16 +2526,12 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
char *sig, u32 global_auth_tok_flags)
{
struct ecryptfs_global_auth_tok *new_auth_tok;
- int rc = 0;
new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache,
GFP_KERNEL);
- if (!new_auth_tok) {
- rc = -ENOMEM;
- printk(KERN_ERR "Error allocating from "
- "ecryptfs_global_auth_tok_cache\n");
- goto out;
- }
+ if (!new_auth_tok)
+ return -ENOMEM;
+
memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX);
new_auth_tok->flags = global_auth_tok_flags;
new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
@@ -2562,7 +2539,6 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
list_add(&new_auth_tok->mount_crypt_stat_list,
&mount_crypt_stat->global_auth_tok_list);
mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
-out:
- return rc;
+ return 0;
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 25aeaa7328ba..f2677c90d96e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -426,7 +426,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
mount_crypt_stat->global_default_cipher_key_size);
if (!cipher_code) {
ecryptfs_printk(KERN_ERR,
- "eCryptfs doesn't support cipher: %s",
+ "eCryptfs doesn't support cipher: %s\n",
mount_crypt_stat->global_default_cipher_name);
rc = -EINVAL;
goto out;
@@ -781,7 +781,7 @@ static struct attribute *attributes[] = {
NULL,
};
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
.attrs = attributes,
};
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 286f10b0363b..9fdd5bcf4564 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -147,8 +147,6 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file)
(*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
if (!(*daemon)) {
rc = -ENOMEM;
- printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
- "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
goto out;
}
(*daemon)->file = file;
@@ -250,8 +248,6 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
if (!msg_ctx->msg) {
rc = -ENOMEM;
- printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
- "GFP_KERNEL memory\n", __func__, msg_size);
goto unlock;
}
msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
@@ -386,7 +382,6 @@ int __init ecryptfs_init_messaging(void)
GFP_KERNEL);
if (!ecryptfs_daemon_hash) {
rc = -ENOMEM;
- printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
mutex_unlock(&ecryptfs_daemon_hash_mux);
goto out;
}
@@ -398,7 +393,6 @@ int __init ecryptfs_init_messaging(void)
GFP_KERNEL);
if (!ecryptfs_msg_ctx_arr) {
rc = -ENOMEM;
- printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
goto out;
}
mutex_init(&ecryptfs_msg_ctx_lists_mux);
@@ -442,15 +436,16 @@ void ecryptfs_release_messaging(void)
}
if (ecryptfs_daemon_hash) {
struct ecryptfs_daemon *daemon;
+ struct hlist_node *n;
int i;
mutex_lock(&ecryptfs_daemon_hash_mux);
for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
int rc;
- hlist_for_each_entry(daemon,
- &ecryptfs_daemon_hash[i],
- euid_chain) {
+ hlist_for_each_entry_safe(daemon, n,
+ &ecryptfs_daemon_hash[i],
+ euid_chain) {
rc = ecryptfs_exorcise_daemon(daemon);
if (rc)
printk(KERN_ERR "%s: Error whilst "
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index e4141f257495..f09cacaf8c80 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -163,12 +163,8 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
struct ecryptfs_message *msg;
msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL);
- if (!msg) {
- printk(KERN_ERR "%s: Out of memory whilst attempting "
- "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
- (sizeof(*msg) + data_size));
+ if (!msg)
return -ENOMEM;
- }
mutex_lock(&msg_ctx->mux);
msg_ctx->msg = msg;
@@ -383,7 +379,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
goto memdup;
} else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) {
printk(KERN_WARNING "%s: Acceptable packet size range is "
- "[%d-%zu], but amount of data written is [%zu].",
+ "[%d-%zu], but amount of data written is [%zu].\n",
__func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count);
return -EINVAL;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 1f0c471b4ba3..cdf358b209d9 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -431,8 +431,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
}
xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL);
if (!xattr_virt) {
- printk(KERN_ERR "Out of memory whilst attempting to write "
- "inode size to xattr\n");
rc = -ENOMEM;
goto out;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..afd548ebc328 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -276,12 +276,6 @@ static DEFINE_MUTEX(epmutex);
/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;
-/* Used for safe wake up implementation */
-static struct nested_calls poll_safewake_ncalls;
-
-/* Used to call file's f_op->poll() under the nested calls boundaries */
-static struct nested_calls poll_readywalk_ncalls;
-
/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;
@@ -551,40 +545,21 @@ out_unlock:
* this special case of epoll.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
- unsigned long events, int subclass)
+
+static struct nested_calls poll_safewake_ncalls;
+
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
{
unsigned long flags;
+ wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
- spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
- wake_up_locked_poll(wqueue, events);
+ spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
+ wake_up_locked_poll(wqueue, POLLIN);
spin_unlock_irqrestore(&wqueue->lock, flags);
-}
-#else
-static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
- unsigned long events, int subclass)
-{
- wake_up_poll(wqueue, events);
-}
-#endif
-static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
-{
- ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
- 1 + call_nests);
return 0;
}
-/*
- * Perform a safe wake up of the poll wait list. The problem is that
- * with the new callback'd wake up system, it is possible that the
- * poll callback is reentered from inside the call to wake_up() done
- * on the poll wait queue head. The rule is that we cannot reenter the
- * wake up code from the same task more than EP_MAX_NESTS times,
- * and we cannot reenter the same wait queue head at all. This will
- * enable to have a hierarchy of epoll file descriptor of no more than
- * EP_MAX_NESTS deep.
- */
static void ep_poll_safewake(wait_queue_head_t *wq)
{
int this_cpu = get_cpu();
@@ -595,6 +570,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
put_cpu();
}
+#else
+
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+ wake_up_poll(wq, POLLIN);
+}
+
+#endif
+
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
wait_queue_head_t *whead;
@@ -880,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
return 0;
}
-static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+ void *priv);
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+ poll_table *pt);
+
+/*
+ * Differs from ep_eventpoll_poll() in that internal callers already have
+ * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
+ * is correctly annotated.
+ */
+static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth)
{
+ struct eventpoll *ep;
+ bool locked;
+
pt->_key = epi->event.events;
+ if (!is_file_epoll(epi->ffd.file))
+ return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
+ epi->event.events;
+
+ ep = epi->ffd.file->private_data;
+ poll_wait(epi->ffd.file, &ep->poll_wait, pt);
+ locked = pt && (pt->_qproc == ep_ptable_queue_proc);
- return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+ return ep_scan_ready_list(epi->ffd.file->private_data,
+ ep_read_events_proc, &depth, depth,
+ locked) & epi->event.events;
}
static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
@@ -892,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
{
struct epitem *epi, *tmp;
poll_table pt;
+ int depth = *(int *)priv;
init_poll_funcptr(&pt, NULL);
+ depth++;
list_for_each_entry_safe(epi, tmp, head, rdllink) {
- if (ep_item_poll(epi, &pt))
+ if (ep_item_poll(epi, &pt, depth)) {
return POLLIN | POLLRDNORM;
- else {
+ } else {
/*
* Item has been dropped into the ready list by the poll
* callback, but it's not actually ready, as far as
@@ -912,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
return 0;
}
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
- poll_table *pt);
-
-struct readyevents_arg {
- struct eventpoll *ep;
- bool locked;
-};
-
-static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
-{
- struct readyevents_arg *arg = priv;
-
- return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
- call_nests + 1, arg->locked);
-}
-
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
- int pollflags;
struct eventpoll *ep = file->private_data;
- struct readyevents_arg arg;
-
- /*
- * During ep_insert() we already hold the ep->mtx for the tfile.
- * Prevent re-aquisition.
- */
- arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
- arg.ep = ep;
+ int depth = 0;
/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);
/*
* Proceed to find out if wanted events are really available inside
- * the ready list. This need to be done under ep_call_nested()
- * supervision, since the call to f_op->poll() done on listed files
- * could re-enter here.
+ * the ready list.
*/
- pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
- ep_poll_readyevents_proc, &arg, ep, current);
-
- return pollflags != -1 ? pollflags : 0;
+ return ep_scan_ready_list(ep, ep_read_events_proc,
+ &depth, depth, false);
}
#ifdef CONFIG_PROC_FS
@@ -1472,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
* this operation completes, the poll callback can start hitting
* the new item.
*/
- revents = ep_item_poll(epi, &epq.pt);
+ revents = ep_item_poll(epi, &epq.pt, 1);
/*
* We have to check if something went wrong during the poll wait queue
@@ -1606,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
- revents = ep_item_poll(epi, &pt);
+ revents = ep_item_poll(epi, &pt, 1);
/*
* If the item is "hot" and it is not registered inside the ready
@@ -1674,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
list_del_init(&epi->rdllink);
- revents = ep_item_poll(epi, &pt);
+ revents = ep_item_poll(epi, &pt, 1);
/*
* If the event mask intersect the caller-requested one,
@@ -2259,7 +2239,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
compat_size_t, sigsetsize)
{
long err;
- compat_sigset_t csigmask;
sigset_t ksigmask, sigsaved;
/*
@@ -2269,9 +2248,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
if (sigmask) {
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+ if (get_compat_sigset(&ksigmask, sigmask))
return -EFAULT;
- sigset_from_compat(&ksigmask, &csigmask);
sigsaved = current->blocked;
set_current_blocked(&ksigmask);
}
@@ -2315,11 +2293,10 @@ static int __init eventpoll_init(void)
*/
ep_nested_calls_init(&poll_loop_ncalls);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_nested_calls_init(&poll_safewake_ncalls);
-
- /* Initialize the structure used to perform file's f_op->poll() calls */
- ep_nested_calls_init(&poll_readywalk_ncalls);
+#endif
/*
* We can have many thousands of epitems, so prevent this from
@@ -2329,11 +2306,11 @@ static int __init eventpoll_init(void)
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
/* Allocates slab cache used to allocate "struct eppoll_entry" */
pwq_cache = kmem_cache_create("eventpoll_pwq",
- sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
+ sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
return 0;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c67b486488fd..2da67699dc33 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
+ ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ad204d2724ac..a0ae27b1bc66 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -28,6 +28,7 @@
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
+#include <linux/mman.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
+ pfn_t pfn;
if (write) {
sb_start_pagefault(sb);
@@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
+ if (IS_ERR(handle)) {
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
+ return VM_FAULT_SIGBUS;
+ }
} else {
down_read(&EXT4_I(inode)->i_mmap_sem);
}
- if (!IS_ERR(handle))
- result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
- else
- result = VM_FAULT_SIGBUS;
+ result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
if (write) {
- if (!IS_ERR(handle))
- ext4_journal_stop(handle);
+ ext4_journal_stop(handle);
+ /* Handling synchronous page fault? */
+ if (result & VM_FAULT_NEEDDSYNC)
+ result = dax_finish_sync_fault(vmf, pe_size, pfn);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
} else {
@@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
+ /*
+ * We don't support synchronous mappings for non-DAX files. At least
+ * until someone comes with a sensible use case.
+ */
+ if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+ return -EOPNOTSUPP;
+
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
@@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
+ .mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d2b582fb141..0992d76f7ab1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3384,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
+/* Tell whether @inode is still dirty as far as a datasync is concerned. */
+static bool ext4_inode_datasync_dirty(struct inode *inode)
+{
+	journal_t *jnl = EXT4_SB(inode->i_sb)->s_journal;
+
+	/* No journal: any queued metadata buffers, else the inode state bit. */
+	if (!jnl)
+		return !list_empty(&inode->i_mapping->private_list) ||
+		       (inode->i_state & I_DIRTY_DATASYNC);
+	/* Journalled: dirty until the i_datasync_tid transaction has committed. */
+	return !jbd2_transaction_committed(jnl, EXT4_I(inode)->i_datasync_tid);
+}
+
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
@@ -3497,6 +3510,8 @@ retry:
}
iomap->flags = 0;
+ if (ext4_inode_datasync_dirty(inode))
+ iomap->flags |= IOMAP_F_DIRTY;
iomap->bdev = inode->i_sb->s_bdev;
iomap->dax_dev = sbi->s_daxdev;
iomap->offset = first_block << blkbits;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b7558f292420..1eec25014f62 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -592,6 +592,44 @@ static int ext4_ioc_getfsmap(struct super_block *sb,
return 0;
}
+/* Shared back end of the native and compat GROUP_ADD ioctls; 0 or -errno. */
+static long ext4_ioctl_group_add(struct file *file,
+				 struct ext4_new_group_data *input)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	int ret, flush_ret = 0;
+
+	ret = ext4_resize_begin(sb);
+	if (ret)
+		return ret;
+	if (ext4_has_feature_bigalloc(sb)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Online resizing not supported with bigalloc");
+		ret = -EOPNOTSUPP;
+		goto resize_done;
+	}
+	ret = mnt_want_write_file(file);
+	if (ret)
+		goto resize_done;
+
+	ret = ext4_group_add(sb, input);
+	/* The journal is flushed whether or not the add succeeded. */
+	if (EXT4_SB(sb)->s_journal) {
+		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+		flush_ret = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	}
+	if (!ret)
+		ret = flush_ret;	/* an error from the add itself wins */
+	mnt_drop_write_file(file);
+	if (!ret && ext4_has_group_desc_csum(sb) &&
+	    test_opt(sb, INIT_INODE_TABLE))
+		ret = ext4_register_li_request(sb, input->group);
+resize_done:
+	ext4_resize_end(sb);
+	return ret;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -776,44 +814,12 @@ mext_out:
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
- int err, err2=0;
-
- err = ext4_resize_begin(sb);
- if (err)
- return err;
if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
- sizeof(input))) {
- err = -EFAULT;
- goto group_add_out;
- }
-
- if (ext4_has_feature_bigalloc(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Online resizing not supported with bigalloc");
- err = -EOPNOTSUPP;
- goto group_add_out;
- }
-
- err = mnt_want_write_file(filp);
- if (err)
- goto group_add_out;
+ sizeof(input)))
+ return -EFAULT;
- err = ext4_group_add(sb, &input);
- if (EXT4_SB(sb)->s_journal) {
- jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
- }
- if (err == 0)
- err = err2;
- mnt_drop_write_file(filp);
- if (!err && ext4_has_group_desc_csum(sb) &&
- test_opt(sb, INIT_INODE_TABLE))
- err = ext4_register_li_request(sb, input.group);
-group_add_out:
- ext4_resize_end(sb);
- return err;
+ return ext4_ioctl_group_add(filp, &input);
}
case EXT4_IOC_MIGRATE:
@@ -1078,8 +1084,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
break;
case EXT4_IOC32_GROUP_ADD: {
struct compat_ext4_new_group_input __user *uinput;
- struct ext4_new_group_input input;
- mm_segment_t old_fs;
+ struct ext4_new_group_data input;
int err;
uinput = compat_ptr(arg);
@@ -1092,12 +1097,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
&uinput->reserved_blocks);
if (err)
return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
- (unsigned long) &input);
- set_fs(old_fs);
- return err;
+ return ext4_ioctl_group_add(file, &input);
}
case EXT4_IOC_MOVE_EXT:
case EXT4_IOC_RESIZE_FS:
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 81cecbe6d7cf..b833ffeee1e1 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -291,7 +291,6 @@ static int fat_parse_long(struct inode *dir, loff_t *pos,
}
}
parse_long:
- slots = 0;
ds = (struct msdos_dir_slot *)*de;
id = ds->id;
if (!(id & 0x40))
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 30f47d0f74a0..0522e283a4f4 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -563,6 +563,9 @@ static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __u
{
struct compat_flock64 fl;
+ BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start));
+ BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len));
+
memset(&fl, 0, sizeof(struct compat_flock64));
copy_flock_fields(&fl, kfl);
if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64)))
@@ -632,9 +635,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
if (err)
break;
err = fixup_compat_flock(&flock);
- if (err)
- return err;
- err = put_compat_flock(&flock, compat_ptr(arg));
+ if (!err)
+ err = put_compat_flock(&flock, compat_ptr(arg));
break;
case F_GETLK64:
case F_OFD_GETLK:
@@ -642,12 +644,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
if (err)
break;
err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
- if (err)
- break;
- err = fixup_compat_flock(&flock);
- if (err)
- return err;
- err = put_compat_flock64(&flock, compat_ptr(arg));
+ if (!err)
+ err = put_compat_flock64(&flock, compat_ptr(arg));
break;
case F_SETLK:
case F_SETLKW:
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 474adc8d2a3a..0ace128f5d23 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -213,8 +213,8 @@ out_err:
return retval;
}
-long do_handle_open(int mountdirfd,
- struct file_handle __user *ufh, int open_flag)
+static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
+ int open_flag)
{
long retval = 0;
struct path path;
diff --git a/fs/file.c b/fs/file.c
index 4eecbf4244a5..3b080834b870 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -593,13 +593,16 @@ void __fd_install(struct files_struct *files, unsigned int fd,
{
struct fdtable *fdt;
- might_sleep();
rcu_read_lock_sched();
- while (unlikely(files->resize_in_progress)) {
+ if (unlikely(files->resize_in_progress)) {
rcu_read_unlock_sched();
- wait_event(files->resize_wait, !files->resize_in_progress);
- rcu_read_lock_sched();
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ BUG_ON(fdt->fd[fd] != NULL);
+ rcu_assign_pointer(fdt->fd[fd], file);
+ spin_unlock(&files->file_lock);
+ return;
}
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
@@ -632,7 +635,6 @@ int __close_fd(struct files_struct *files, unsigned fd)
if (!file)
goto out_unlock;
rcu_assign_pointer(fdt->fd[fd], NULL);
- __clear_close_on_exec(fd, fdt);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
return filp_close(file, files);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 8aec5e732abf..b63a4df7327b 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -98,13 +98,11 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
struct hfs_bnode *src_node, int src, int len)
{
- struct hfs_btree *tree;
struct page *src_page, *dst_page;
hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
- tree = src_node->tree;
src += src_node->page_offset;
dst += dst_node->page_offset;
src_page = src_node->page[0];
@@ -237,7 +235,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
{
- struct super_block *sb;
struct hfs_bnode *node, *node2;
struct address_space *mapping;
struct page *page;
@@ -249,7 +246,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
return NULL;
}
- sb = tree->inode->i_sb;
size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
sizeof(struct page *);
node = kzalloc(size, GFP_KERNEL);
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index d77015c3f22c..177fae4e6581 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -127,14 +127,12 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
struct hfs_bnode *src_node, int src, int len)
{
- struct hfs_btree *tree;
struct page **src_page, **dst_page;
int l;
hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
- tree = src_node->tree;
src += src_node->page_offset;
dst += dst_node->page_offset;
src_page = src_node->page + (src >> PAGE_SHIFT);
@@ -401,7 +399,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
{
- struct super_block *sb;
struct hfs_bnode *node, *node2;
struct address_space *mapping;
struct page *page;
@@ -414,7 +411,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
return NULL;
}
- sb = tree->inode->i_sb;
size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
sizeof(struct page *);
node = kzalloc(size, GFP_KERNEL);
diff --git a/fs/internal.h b/fs/internal.h
index 48cee21b4f14..df262f41a0ef 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,6 +55,7 @@ extern void __init chrdev_init(void);
extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
+long do_unlinkat(int dfd, struct filename *name);
/*
* namespace.c
diff --git a/fs/iomap.c b/fs/iomap.c
index b9f74803e56c..47d29ccffaef 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -856,6 +856,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
struct bio *bio;
bool need_zeroout = false;
int nr_pages, ret;
+ size_t copied = 0;
if ((pos | length | align) & ((1 << blkbits) - 1))
return -EINVAL;
@@ -867,7 +868,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
/*FALLTHRU*/
case IOMAP_UNWRITTEN:
if (!(dio->flags & IOMAP_DIO_WRITE)) {
- iov_iter_zero(length, dio->submit.iter);
+ length = iov_iter_zero(length, dio->submit.iter);
dio->size += length;
return length;
}
@@ -904,8 +905,11 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
}
do {
- if (dio->error)
+ size_t n;
+ if (dio->error) {
+ iov_iter_revert(dio->submit.iter, copied);
return 0;
+ }
bio = bio_alloc(GFP_KERNEL, nr_pages);
bio_set_dev(bio, iomap->bdev);
@@ -918,20 +922,24 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
ret = bio_iov_iter_get_pages(bio, &iter);
if (unlikely(ret)) {
bio_put(bio);
- return ret;
+ return copied ? copied : ret;
}
+ n = bio->bi_iter.bi_size;
if (dio->flags & IOMAP_DIO_WRITE) {
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
- task_io_account_write(bio->bi_iter.bi_size);
+ task_io_account_write(n);
} else {
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (dio->flags & IOMAP_DIO_DIRTY)
bio_set_pages_dirty(bio);
}
- dio->size += bio->bi_iter.bi_size;
- pos += bio->bi_iter.bi_size;
+ iov_iter_advance(dio->submit.iter, n);
+
+ dio->size += n;
+ pos += n;
+ copied += n;
nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
@@ -947,9 +955,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
if (pad)
iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
}
-
- iov_iter_advance(dio->submit.iter, length);
- return length;
+ return copied;
}
ssize_t
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index d2a85c9720e9..67546c7ad473 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
return err;
}
+/* Return 0 while @tid is the running or committing transaction, else 1. */
+int jbd2_transaction_committed(journal_t *journal, tid_t tid)
+{
+	int pending = 0;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid)
+		pending = 1;
+	else if (journal->j_committing_transaction &&
+		 journal->j_committing_transaction->t_tid == tid)
+		pending = 1;
+	read_unlock(&journal->j_state_lock);
+	return !pending;
+}
+EXPORT_SYMBOL(jbd2_transaction_committed);
+
/*
* When this function returns the transaction corresponding to tid
* will be completed. If the transaction has currently running, start
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index b837fb7e290a..a8e3777c94dc 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -369,6 +369,7 @@ static int lockd_start_svc(struct svc_serv *serv)
printk(KERN_WARNING
"lockd_up: svc_rqst allocation failed, error=%d\n",
error);
+ lockd_unregister_notifiers();
goto out_rqst;
}
@@ -459,13 +460,16 @@ int lockd_up(struct net *net)
}
error = lockd_up_net(serv, net);
- if (error < 0)
- goto err_net;
+ if (error < 0) {
+ lockd_unregister_notifiers();
+ goto err_put;
+ }
error = lockd_start_svc(serv);
- if (error < 0)
- goto err_start;
-
+ if (error < 0) {
+ lockd_down_net(serv, net);
+ goto err_put;
+ }
nlmsvc_users++;
/*
* Note: svc_serv structures have an initial use count of 1,
@@ -476,12 +480,6 @@ err_put:
err_create:
mutex_unlock(&nlmsvc_mutex);
return error;
-
-err_start:
- lockd_down_net(serv, net);
-err_net:
- lockd_unregister_notifiers();
- goto err_put;
}
EXPORT_SYMBOL_GPL(lockd_up);
diff --git a/fs/namei.c b/fs/namei.c
index 5424b10cfdc4..f0c7a7b9b6ca 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3459,7 +3459,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
goto out;
child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
error = PTR_ERR(child);
- if (unlikely(IS_ERR(child)))
+ if (IS_ERR(child))
goto out2;
dput(path.dentry);
path.dentry = child;
@@ -4010,10 +4010,9 @@ EXPORT_SYMBOL(vfs_unlink);
* writeout happening, and we don't want to prevent access to the directory
* while waiting on the I/O.
*/
-static long do_unlinkat(int dfd, const char __user *pathname)
+long do_unlinkat(int dfd, struct filename *name)
{
int error;
- struct filename *name;
struct dentry *dentry;
struct path path;
struct qstr last;
@@ -4022,8 +4021,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
- name = filename_parentat(dfd, getname(pathname), lookup_flags,
- &path, &last, &type);
+ name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -4065,12 +4063,12 @@ exit2:
mnt_drop_write(path.mnt);
exit1:
path_put(&path);
- putname(name);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
inode = NULL;
goto retry;
}
+ putname(name);
return error;
slashes:
@@ -4091,12 +4089,12 @@ SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
if (flag & AT_REMOVEDIR)
return do_rmdir(dfd, pathname);
- return do_unlinkat(dfd, pathname);
+ return do_unlinkat(dfd, getname(pathname));
}
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
- return do_unlinkat(AT_FDCWD, pathname);
+ return do_unlinkat(AT_FDCWD, getname(pathname));
}
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b60627bcfc62..ef6729568432 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -67,7 +67,7 @@ out:
*/
void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
{
- if (atomic_dec_and_test(&dreq->count))
+ if (refcount_dec_and_test(&dreq->count))
kfree(dreq);
}
@@ -87,7 +87,7 @@ static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
dreq = container_of(req, struct nfs_cache_defer_req, req);
dreq->deferred_req.revisit = nfs_dns_cache_revisit;
- atomic_inc(&dreq->count);
+ refcount_inc(&dreq->count);
return &dreq->deferred_req;
}
@@ -99,7 +99,7 @@ struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
if (dreq) {
init_completion(&dreq->completion);
- atomic_set(&dreq->count, 1);
+ refcount_set(&dreq->count, 1);
dreq->req.defer = nfs_dns_cache_defer;
}
return dreq;
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 4e6236a86cf7..220ee409abc4 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -16,7 +16,7 @@ struct nfs_cache_defer_req {
struct cache_req req;
struct cache_deferred_req deferred_req;
struct completion completion;
- atomic_t count;
+ refcount_t count;
};
extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index cd9d992feb2e..509dc5adeb8f 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -49,15 +49,15 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
if (ret <= 0)
goto out_err;
nn->nfs_callback_tcpport = ret;
- dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
- nn->nfs_callback_tcpport, PF_INET, net);
+ dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
+ nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
if (ret > 0) {
nn->nfs_callback_tcpport6 = ret;
- dprintk("NFS: Callback listener port = %u (af %u, net %p)\n",
- nn->nfs_callback_tcpport6, PF_INET6, net);
+ dprintk("NFS: Callback listener port = %u (af %u, net %x\n",
+ nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum);
} else if (ret != -EAFNOSUPPORT)
goto out_err;
return 0;
@@ -185,7 +185,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
if (--nn->cb_users[minorversion])
return;
- dprintk("NFS: destroy per-net callback data; net=%p\n", net);
+ dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
svc_shutdown_net(serv, net);
}
@@ -198,7 +198,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
if (nn->cb_users[minorversion]++)
return 0;
- dprintk("NFS: create per-net callback data; net=%p\n", net);
+ dprintk("NFS: create per-net callback data; net=%x\n", net->ns.inum);
ret = svc_bind(serv, net);
if (ret < 0) {
@@ -223,7 +223,7 @@ err_socks:
err_bind:
nn->cb_users[minorversion]--;
dprintk("NFS: Couldn't create callback socket: err = %d; "
- "net = %p\n", ret, net);
+ "net = %x\n", ret, net->ns.inum);
return ret;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 19151f6c0e97..2435af56b87e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -440,7 +440,7 @@ static bool referring_call_exists(struct nfs_client *clp,
uint32_t nrclists,
struct referring_call_list *rclists)
{
- bool status = 0;
+ bool status = false;
int i, j;
struct nfs4_session *session;
struct nfs4_slot_table *tbl;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 22880ef6d8dd..0ac2fb1c6b63 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -163,7 +163,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
- atomic_set(&clp->cl_count, 1);
+ refcount_set(&clp->cl_count, 1);
clp->cl_cons_state = NFS_CS_INITING;
memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
@@ -269,7 +269,7 @@ void nfs_put_client(struct nfs_client *clp)
nn = net_generic(clp->cl_net, nfs_net_id);
- if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
+ if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
list_del(&clp->cl_share_link);
nfs_cb_idr_remove_locked(clp);
spin_unlock(&nn->nfs_client_lock);
@@ -314,7 +314,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
sap))
continue;
- atomic_inc(&clp->cl_count);
+ refcount_inc(&clp->cl_count);
return clp;
}
return NULL;
@@ -1006,7 +1006,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
/* Copy data from the source */
server->nfs_client = source->nfs_client;
server->destroy = source->destroy;
- atomic_inc(&server->nfs_client->cl_count);
+ refcount_inc(&server->nfs_client->cl_count);
nfs_server_copy_userdata(server, source);
server->fsid = fattr->fsid;
@@ -1166,7 +1166,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
clp->rpc_ops->version,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
- atomic_read(&clp->cl_count),
+ refcount_read(&clp->cl_count),
clp->cl_hostname);
rcu_read_unlock();
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 606dd3871f66..ade44ca0c66c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -1041,6 +1041,33 @@ int nfs_delegations_present(struct nfs_client *clp)
}
/**
+ * nfs4_refresh_delegation_stateid - Update delegation stateid seqid
+ * @dst: stateid to refresh
+ * @inode: inode to check (may be NULL, in which case "false" is returned)
+ *
+ * Returns "true" and updates "dst->seqid" if inode had a delegation
+ * that matches our delegation stateid. Otherwise "false" is returned.
+ */
+bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+	bool ret = false;
+	if (!inode)
+		goto out;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation != NULL &&
+	    nfs4_stateid_match_other(dst, &delegation->stateid)) {
+		dst->seqid = delegation->stateid.seqid;
+		ret = true;	/* report it; fall through to drop the RCU read lock */
+	}
+	rcu_read_unlock();
+out:
+	return ret;
+}
+
+/**
* nfs4_copy_delegation_stateid - Copy inode's state ID information
* @inode: inode to check
* @flags: delegation type requirement
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index ddaf2644cf13..185a09f37a89 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -62,6 +62,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
+bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f439f1c45008..e51ae52ed14f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -118,13 +118,6 @@ nfs_opendir(struct inode *inode, struct file *filp)
goto out;
}
filp->private_data = ctx;
- if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
- /* This is a mountpoint, so d_revalidate will never
- * have been called, so we need to refresh the
- * inode (for close-open consistency) ourselves.
- */
- __nfs_revalidate_inode(NFS_SERVER(inode), inode);
- }
out:
put_rpccred(cred);
return res;
@@ -253,7 +246,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
desc->cache_entry_index = index;
return 0;
out_eof:
- desc->eof = 1;
+ desc->eof = true;
return -EBADCOOKIE;
}
@@ -307,7 +300,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
if (array->eof_index >= 0) {
status = -EBADCOOKIE;
if (*desc->dir_cookie == array->last_cookie)
- desc->eof = 1;
+ desc->eof = true;
}
out:
return status;
@@ -761,7 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
ent = &array->array[i];
if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
nfs_compat_user_ino64(ent->ino), ent->d_type)) {
- desc->eof = 1;
+ desc->eof = true;
break;
}
desc->ctx->pos++;
@@ -773,7 +766,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
ctx->duped = 1;
}
if (array->eof_index >= 0)
- desc->eof = 1;
+ desc->eof = true;
kunmap(desc->page);
cache_page_release(desc);
@@ -873,7 +866,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
if (res == -EBADCOOKIE) {
res = 0;
/* This means either end of directory */
- if (*desc->dir_cookie && desc->eof == 0) {
+ if (*desc->dir_cookie && !desc->eof) {
/* Or that the server has 'lost' a cookie */
res = uncached_readdir(desc);
if (res == 0)
@@ -1241,8 +1234,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
return 0;
}
- if (nfs_mapping_need_revalidate_inode(inode))
- error = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ error = nfs_lookup_verify_inode(inode, flags);
dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n",
__func__, inode->i_ino, error ? "invalid" : "valid");
return !error;
@@ -1393,6 +1385,7 @@ static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
const struct dentry_operations nfs4_dentry_operations = {
.d_revalidate = nfs4_lookup_revalidate,
+ .d_weak_revalidate = nfs_weak_revalidate,
.d_delete = nfs_dentry_delete,
.d_iput = nfs_dentry_iput,
.d_automount = nfs_d_automount,
@@ -2064,7 +2057,7 @@ out:
* should mark the directories for revalidation.
*/
d_move(old_dentry, new_dentry);
- nfs_set_verifier(new_dentry,
+ nfs_set_verifier(old_dentry,
nfs_save_change_attribute(new_dir));
} else if (error == -ENOENT)
nfs_dentry_handle_enoent(old_dentry);
@@ -2369,15 +2362,15 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
}
EXPORT_SYMBOL_GPL(nfs_access_add_cache);
-#define NFS_MAY_READ (NFS4_ACCESS_READ)
-#define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \
- NFS4_ACCESS_EXTEND | \
- NFS4_ACCESS_DELETE)
-#define NFS_FILE_MAY_WRITE (NFS4_ACCESS_MODIFY | \
- NFS4_ACCESS_EXTEND)
+#define NFS_MAY_READ (NFS_ACCESS_READ)
+#define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \
+ NFS_ACCESS_EXTEND | \
+ NFS_ACCESS_DELETE)
+#define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \
+ NFS_ACCESS_EXTEND)
#define NFS_DIR_MAY_WRITE NFS_MAY_WRITE
-#define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP)
-#define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE)
+#define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP)
+#define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE)
static int
nfs_access_calc_mask(u32 access_result, umode_t umode)
{
@@ -2425,9 +2418,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
if (!may_block)
goto out;
- /* Be clever: ask server to check for all possible rights */
- cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE
- | NFS_MAY_WRITE | NFS_MAY_READ;
+ /*
+ * Determine which access bits we want to ask for...
+ */
+ cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
+ if (S_ISDIR(inode->i_mode))
+ cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
+ else
+ cache.mask |= NFS_ACCESS_EXECUTE;
cache.cred = cred;
status = NFS_PROTO(inode)->access(inode, &cache);
if (status != 0) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 0214dd1e1060..81cca49a8375 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -829,23 +829,9 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
- /*
- * VFS doesn't require the open mode to match a flock() lock's type.
- * NFS, however, may simulate flock() locking with posix locking which
- * requires the open mode to match the lock type.
- */
- switch (fl->fl_type) {
- case F_UNLCK:
+ /* We're simulating flock() locks using posix locks on the server */
+ if (fl->fl_type == F_UNLCK)
return do_unlk(filp, cmd, fl, is_local);
- case F_RDLCK:
- if (!(filp->f_mode & FMODE_READ))
- return -EBADF;
- break;
- case F_WRLCK:
- if (!(filp->f_mode & FMODE_WRITE))
- return -EBADF;
- }
-
return do_setlk(filp, cmd, fl, is_local);
}
EXPORT_SYMBOL_GPL(nfs_flock);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 508126eb49f9..4e54d8b5413a 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -471,10 +471,10 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
return PNFS_NOT_ATTEMPTED;
dprintk("%s USE DS: %s cl_count %d\n", __func__,
- ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
+ ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count));
/* No multipath support. Use first DS */
- atomic_inc(&ds->ds_clp->cl_count);
+ refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
hdr->ds_commit_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
@@ -515,10 +515,10 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
- offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
+ offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count));
hdr->pgio_done_cb = filelayout_write_done_cb;
- atomic_inc(&ds->ds_clp->cl_count);
+ refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
hdr->ds_commit_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
@@ -1064,9 +1064,9 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
goto out_err;
dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
- data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count));
+ data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count));
data->commit_done_cb = filelayout_commit_done_cb;
- atomic_inc(&ds->ds_clp->cl_count);
+ refcount_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
if (fh)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index b0fa83a60754..c75ad982bcfc 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -187,7 +187,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
continue;
if (!ff_mirror_match_fh(mirror, pos))
continue;
- if (atomic_inc_not_zero(&pos->ref)) {
+ if (refcount_inc_not_zero(&pos->ref)) {
spin_unlock(&inode->i_lock);
return pos;
}
@@ -218,7 +218,7 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
mirror = kzalloc(sizeof(*mirror), gfp_flags);
if (mirror != NULL) {
spin_lock_init(&mirror->lock);
- atomic_set(&mirror->ref, 1);
+ refcount_set(&mirror->ref, 1);
INIT_LIST_HEAD(&mirror->mirrors);
}
return mirror;
@@ -242,7 +242,7 @@ static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
{
- if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
+ if (mirror != NULL && refcount_dec_and_test(&mirror->ref))
ff_layout_free_mirror(mirror);
}
@@ -1726,10 +1726,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
vers = nfs4_ff_layout_ds_version(lseg, idx);
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
- ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
+ ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
hdr->pgio_done_cb = ff_layout_read_done_cb;
- atomic_inc(&ds->ds_clp->cl_count);
+ refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
if (fh)
@@ -1785,11 +1785,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
- offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
+ offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count),
vers);
hdr->pgio_done_cb = ff_layout_write_done_cb;
- atomic_inc(&ds->ds_clp->cl_count);
+ refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
hdr->ds_commit_idx = idx;
fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
@@ -1863,11 +1863,11 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
vers = nfs4_ff_layout_ds_version(lseg, idx);
dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
- data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
+ data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
vers);
data->commit_done_cb = ff_layout_commit_done_cb;
data->cred = ds_cred;
- atomic_inc(&ds->ds_clp->cl_count);
+ refcount_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
if (fh)
@@ -2286,7 +2286,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags))
continue;
/* mirror refcount put in cleanup_layoutstats */
- if (!atomic_inc_not_zero(&mirror->ref))
+ if (!refcount_inc_not_zero(&mirror->ref))
continue;
dev = &mirror->mirror_ds->id_node;
memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 679cb087ef3f..411798346e48 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -14,6 +14,7 @@
#define FF_FLAGS_NO_IO_THRU_MDS 2
#define FF_FLAGS_NO_READ_IO 4
+#include <linux/refcount.h>
#include "../pnfs.h"
/* XXX: Let's filter out insanely large mirror count for now to avoid oom
@@ -82,7 +83,7 @@ struct nfs4_ff_layout_mirror {
nfs4_stateid stateid;
struct rpc_cred __rcu *ro_cred;
struct rpc_cred __rcu *rw_cred;
- atomic_t ref;
+ refcount_t ref;
spinlock_t lock;
unsigned long flags;
struct nfs4_ff_layoutstat read_stat;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1629056aa2c9..38b93d54c02e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -783,7 +783,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr);
static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
{
- atomic_set(&l_ctx->count, 1);
+ refcount_set(&l_ctx->count, 1);
l_ctx->lockowner = current->files;
INIT_LIST_HEAD(&l_ctx->list);
atomic_set(&l_ctx->io_count, 0);
@@ -797,7 +797,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
do {
if (pos->lockowner != current->files)
continue;
- atomic_inc(&pos->count);
+ refcount_inc(&pos->count);
return pos;
} while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head);
return NULL;
@@ -836,7 +836,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
struct nfs_open_context *ctx = l_ctx->open_context;
struct inode *inode = d_inode(ctx->dentry);
- if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
+ if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock))
return;
list_del(&l_ctx->list);
spin_unlock(&inode->i_lock);
@@ -913,7 +913,7 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context);
struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
{
if (ctx != NULL)
- atomic_inc(&ctx->lock_context.count);
+ refcount_inc(&ctx->lock_context.count);
return ctx;
}
EXPORT_SYMBOL_GPL(get_nfs_open_context);
@@ -924,11 +924,11 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
struct super_block *sb = ctx->dentry->d_sb;
if (!list_empty(&ctx->list)) {
- if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+ if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
return;
list_del(&ctx->list);
spin_unlock(&inode->i_lock);
- } else if (!atomic_dec_and_test(&ctx->lock_context.count))
+ } else if (!refcount_dec_and_test(&ctx->lock_context.count))
return;
if (inode != NULL)
NFS_PROTO(inode)->close_context(ctx, is_sync);
@@ -2084,8 +2084,12 @@ static int nfs_net_init(struct net *net)
static void nfs_net_exit(struct net *net)
{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
nfs_fs_proc_net_exit(net);
nfs_cleanup_cb_ident_idr(net);
+ WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
+ WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
}
static struct pernet_operations nfs_net_ops = {
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index bc673fb47fb3..49f848fd1f04 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -188,6 +188,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
{
struct nfs3_accessargs arg = {
.fh = NFS_FH(inode),
+ .access = entry->mask,
};
struct nfs3_accessres res;
struct rpc_message msg = {
@@ -196,25 +197,9 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
.rpc_resp = &res,
.rpc_cred = entry->cred,
};
- int mode = entry->mask;
int status = -ENOMEM;
dprintk("NFS call access\n");
-
- if (mode & MAY_READ)
- arg.access |= NFS3_ACCESS_READ;
- if (S_ISDIR(inode->i_mode)) {
- if (mode & MAY_WRITE)
- arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE;
- if (mode & MAY_EXEC)
- arg.access |= NFS3_ACCESS_LOOKUP;
- } else {
- if (mode & MAY_WRITE)
- arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND;
- if (mode & MAY_EXEC)
- arg.access |= NFS3_ACCESS_EXECUTE;
- }
-
res.fattr = nfs_alloc_fattr();
if (res.fattr == NULL)
goto out;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index dcfcf7fd7438..b374f680830c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -145,7 +145,7 @@ struct nfs4_lock_state {
unsigned long ls_flags;
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
- atomic_t ls_count;
+ refcount_t ls_count;
fl_owner_t ls_owner;
};
@@ -162,6 +162,7 @@ enum {
NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */
NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */
+ NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */
};
struct nfs4_state {
@@ -185,6 +186,8 @@ struct nfs4_state {
unsigned int n_rdwr; /* Number of read/write references */
fmode_t state; /* State on the server (R,W, or RW) */
atomic_t count;
+
+ wait_queue_head_t waitq;
};
@@ -458,6 +461,10 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
const struct nfs_lock_context *, nfs4_stateid *,
struct rpc_cred **);
+extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state);
+extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state);
extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -465,7 +472,7 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
extern void nfs_release_seqid(struct nfs_seqid *seqid);
extern void nfs_free_seqid(struct nfs_seqid *seqid);
-extern int nfs4_setup_sequence(const struct nfs_client *client,
+extern int nfs4_setup_sequence(struct nfs_client *client,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
struct rpc_task *task);
@@ -475,6 +482,7 @@ extern int nfs4_sequence_done(struct rpc_task *task,
extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
extern const nfs4_stateid zero_stateid;
+extern const nfs4_stateid invalid_stateid;
/* nfs4super.c */
struct nfs_mount_info;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index e9bea90dc017..12bbab0becb4 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -483,7 +483,7 @@ static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new,
* ID and serverowner fields. Wait for CREATE_SESSION
* to finish. */
if (pos->cl_cons_state > NFS_CS_READY) {
- atomic_inc(&pos->cl_count);
+ refcount_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
nfs_put_client(*prev);
@@ -559,7 +559,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
* way that a SETCLIENTID_CONFIRM to pos can succeed is
* if new and pos point to the same server:
*/
- atomic_inc(&pos->cl_count);
+ refcount_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
nfs_put_client(prev);
@@ -715,7 +715,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
continue;
found:
- atomic_inc(&pos->cl_count);
+ refcount_inc(&pos->cl_count);
*result = pos;
status = 0;
break;
@@ -749,7 +749,7 @@ nfs4_find_client_ident(struct net *net, int cb_ident)
spin_lock(&nn->nfs_client_lock);
clp = idr_find(&nn->cb_ident_idr, cb_ident);
if (clp)
- atomic_inc(&clp->cl_count);
+ refcount_inc(&clp->cl_count);
spin_unlock(&nn->nfs_client_lock);
return clp;
}
@@ -793,7 +793,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
- if (nfs4_cb_match_client(addr, clp, minorversion) == false)
+ if (!nfs4_cb_match_client(addr, clp, minorversion))
continue;
if (!nfs4_has_session(clp))
@@ -804,7 +804,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
continue;
- atomic_inc(&clp->cl_count);
+ refcount_inc(&clp->cl_count);
spin_unlock(&nn->nfs_client_lock);
return clp;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f90090e8c959..56fa5a16e097 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -96,6 +96,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_open_context *ctx, struct nfs4_label *ilabel,
struct nfs4_label *olabel);
#ifdef CONFIG_NFS_V4_1
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
+ struct rpc_cred *cred,
+ struct nfs4_slot *slot,
+ bool is_privileged);
static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
struct rpc_cred *);
static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
@@ -254,15 +258,12 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
};
const u32 nfs4_fs_locations_bitmap[3] = {
- FATTR4_WORD0_TYPE
- | FATTR4_WORD0_CHANGE
+ FATTR4_WORD0_CHANGE
| FATTR4_WORD0_SIZE
| FATTR4_WORD0_FSID
| FATTR4_WORD0_FILEID
| FATTR4_WORD0_FS_LOCATIONS,
- FATTR4_WORD1_MODE
- | FATTR4_WORD1_NUMLINKS
- | FATTR4_WORD1_OWNER
+ FATTR4_WORD1_OWNER
| FATTR4_WORD1_OWNER_GROUP
| FATTR4_WORD1_RAWDEV
| FATTR4_WORD1_SPACE_USED
@@ -644,13 +645,14 @@ static int nfs40_sequence_done(struct rpc_task *task,
#if defined(CONFIG_NFS_V4_1)
-static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
+static void nfs41_release_slot(struct nfs4_slot *slot)
{
struct nfs4_session *session;
struct nfs4_slot_table *tbl;
- struct nfs4_slot *slot = res->sr_slot;
bool send_new_highest_used_slotid = false;
+ if (!slot)
+ return;
tbl = slot->table;
session = tbl->session;
@@ -676,13 +678,18 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
send_new_highest_used_slotid = false;
out_unlock:
spin_unlock(&tbl->slot_tbl_lock);
- res->sr_slot = NULL;
if (send_new_highest_used_slotid)
nfs41_notify_server(session->clp);
if (waitqueue_active(&tbl->slot_waitq))
wake_up_all(&tbl->slot_waitq);
}
+static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+ nfs41_release_slot(res->sr_slot);
+ res->sr_slot = NULL;
+}
+
static int nfs41_sequence_process(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
@@ -710,13 +717,6 @@ static int nfs41_sequence_process(struct rpc_task *task,
/* Check the SEQUENCE operation status */
switch (res->sr_status) {
case 0:
- /* If previous op on slot was interrupted and we reused
- * the seq# and got a reply from the cache, then retry
- */
- if (task->tk_status == -EREMOTEIO && interrupted) {
- ++slot->seq_nr;
- goto retry_nowait;
- }
/* Update the slot's sequence and clientid lease timer */
slot->seq_done = 1;
clp = session->clp;
@@ -750,16 +750,16 @@ static int nfs41_sequence_process(struct rpc_task *task,
* The slot id we used was probably retired. Try again
* using a different slot id.
*/
+ if (slot->seq_nr < slot->table->target_highest_slotid)
+ goto session_recover;
goto retry_nowait;
case -NFS4ERR_SEQ_MISORDERED:
/*
* Was the last operation on this sequence interrupted?
* If so, retry after bumping the sequence number.
*/
- if (interrupted) {
- ++slot->seq_nr;
- goto retry_nowait;
- }
+ if (interrupted)
+ goto retry_new_seq;
/*
* Could this slot have been previously retired?
* If so, then the server may be expecting seq_nr = 1!
@@ -768,10 +768,11 @@ static int nfs41_sequence_process(struct rpc_task *task,
slot->seq_nr = 1;
goto retry_nowait;
}
- break;
+ goto session_recover;
case -NFS4ERR_SEQ_FALSE_RETRY:
- ++slot->seq_nr;
- goto retry_nowait;
+ if (interrupted)
+ goto retry_new_seq;
+ goto session_recover;
default:
/* Just update the slot sequence no. */
slot->seq_done = 1;
@@ -781,6 +782,11 @@ out:
dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
out_noaction:
return ret;
+session_recover:
+ nfs4_schedule_session_recovery(session, res->sr_status);
+ goto retry_nowait;
+retry_new_seq:
+ ++slot->seq_nr;
retry_nowait:
if (rpc_restart_call_prepare(task)) {
nfs41_sequence_free_slot(res);
@@ -857,6 +863,17 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
.rpc_call_done = nfs41_call_sync_done,
};
+static void
+nfs4_sequence_process_interrupted(struct nfs_client *client,
+ struct nfs4_slot *slot, struct rpc_cred *cred)
+{
+ struct rpc_task *task;
+
+ task = _nfs41_proc_sequence(client, cred, slot, true);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+}
+
#else /* !CONFIG_NFS_V4_1 */
static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -877,9 +894,34 @@ int nfs4_sequence_done(struct rpc_task *task,
}
EXPORT_SYMBOL_GPL(nfs4_sequence_done);
+static void
+nfs4_sequence_process_interrupted(struct nfs_client *client,
+ struct nfs4_slot *slot, struct rpc_cred *cred)
+{
+ WARN_ON_ONCE(1);
+ slot->interrupted = 0;
+}
+
#endif /* !CONFIG_NFS_V4_1 */
-int nfs4_setup_sequence(const struct nfs_client *client,
+static
+void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct nfs4_slot *slot)
+{
+ if (!slot)
+ return;
+ slot->privileged = args->sa_privileged ? 1 : 0;
+ args->sa_slot = slot;
+
+ res->sr_slot = slot;
+ res->sr_timestamp = jiffies;
+ res->sr_status_flags = 0;
+ res->sr_status = 1;
+
+}
+
+int nfs4_setup_sequence(struct nfs_client *client,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
struct rpc_task *task)
@@ -897,29 +939,28 @@ int nfs4_setup_sequence(const struct nfs_client *client,
task->tk_timeout = 0;
}
- spin_lock(&tbl->slot_tbl_lock);
- /* The state manager will wait until the slot table is empty */
- if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
- goto out_sleep;
+ for (;;) {
+ spin_lock(&tbl->slot_tbl_lock);
+ /* The state manager will wait until the slot table is empty */
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ goto out_sleep;
+
+ slot = nfs4_alloc_slot(tbl);
+ if (IS_ERR(slot)) {
+ /* Try again in 1/4 second */
+ if (slot == ERR_PTR(-ENOMEM))
+ task->tk_timeout = HZ >> 2;
+ goto out_sleep;
+ }
+ spin_unlock(&tbl->slot_tbl_lock);
- slot = nfs4_alloc_slot(tbl);
- if (IS_ERR(slot)) {
- /* Try again in 1/4 second */
- if (slot == ERR_PTR(-ENOMEM))
- task->tk_timeout = HZ >> 2;
- goto out_sleep;
+ if (likely(!slot->interrupted))
+ break;
+ nfs4_sequence_process_interrupted(client,
+ slot, task->tk_msg.rpc_cred);
}
- spin_unlock(&tbl->slot_tbl_lock);
-
- slot->privileged = args->sa_privileged ? 1 : 0;
- args->sa_slot = slot;
- res->sr_slot = slot;
- if (session) {
- res->sr_timestamp = jiffies;
- res->sr_status_flags = 0;
- res->sr_status = 1;
- }
+ nfs4_sequence_attach_slot(args, res, slot);
trace_nfs4_setup_sequence(session, args);
out_start:
@@ -1044,6 +1085,12 @@ struct nfs4_opendata {
int rpc_status;
};
+struct nfs4_open_createattrs {
+ struct nfs4_label *label;
+ struct iattr *sattr;
+ const __u32 verf[2];
+};
+
static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
int err, struct nfs4_exception *exception)
{
@@ -1113,8 +1160,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
struct nfs4_state_owner *sp, fmode_t fmode, int flags,
- const struct iattr *attrs,
- struct nfs4_label *label,
+ const struct nfs4_open_createattrs *c,
enum open_claim_type4 claim,
gfp_t gfp_mask)
{
@@ -1122,6 +1168,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
struct inode *dir = d_inode(parent);
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+ struct nfs4_label *label = (c != NULL) ? c->label : NULL;
struct nfs4_opendata *p;
p = kzalloc(sizeof(*p), gfp_mask);
@@ -1187,15 +1234,11 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
p->o_arg.fh = NFS_FH(d_inode(dentry));
}
- if (attrs != NULL && attrs->ia_valid != 0) {
- __u32 verf[2];
-
+ if (c != NULL && c->sattr != NULL && c->sattr->ia_valid != 0) {
p->o_arg.u.attrs = &p->attrs;
- memcpy(&p->attrs, attrs, sizeof(p->attrs));
+ memcpy(&p->attrs, c->sattr, sizeof(p->attrs));
- verf[0] = jiffies;
- verf[1] = current->pid;
- memcpy(p->o_arg.u.verifier.data, verf,
+ memcpy(p->o_arg.u.verifier.data, c->verf,
sizeof(p->o_arg.u.verifier.data));
}
p->c_arg.fh = &p->o_res.fh;
@@ -1334,6 +1377,25 @@ static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
}
#endif /* CONFIG_NFS_V4_1 */
+static void nfs_state_log_update_open_stateid(struct nfs4_state *state)
+{
+ if (test_and_clear_bit(NFS_STATE_CHANGE_WAIT, &state->flags))
+ wake_up_all(&state->waitq);
+}
+
+static void nfs_state_log_out_of_order_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ u32 state_seqid = be32_to_cpu(state->open_stateid.seqid);
+ u32 stateid_seqid = be32_to_cpu(stateid->seqid);
+
+ if (stateid_seqid == state_seqid + 1U ||
+ (stateid_seqid == 1U && state_seqid == 0xffffffffU))
+ nfs_state_log_update_open_stateid(state);
+ else
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
+}
+
static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
{
struct nfs_client *clp = state->owner->so_server->nfs_client;
@@ -1349,18 +1411,32 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
nfs4_state_mark_reclaim_nograce(clp, state);
}
+/*
+ * Check for whether or not the caller may update the open stateid
+ * to the value passed in by stateid.
+ *
+ * Note: This function relies heavily on the server implementing
+ * RFC7530 Section 9.1.4.2, and RFC5661 Section 8.2.2
+ * correctly.
+ * i.e. The stateid seqids have to be initialised to 1, and
+ * are then incremented on every state transition.
+ */
static bool nfs_need_update_open_stateid(struct nfs4_state *state,
- const nfs4_stateid *stateid, nfs4_stateid *freeme)
+ const nfs4_stateid *stateid)
{
- if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
- return true;
- if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
- nfs4_stateid_copy(freeme, &state->open_stateid);
- nfs_test_and_clear_all_open_stateid(state);
+ if (test_bit(NFS_OPEN_STATE, &state->flags) == 0 ||
+ !nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ if (stateid->seqid == cpu_to_be32(1))
+ nfs_state_log_update_open_stateid(state);
+ else
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
return true;
}
- if (nfs4_stateid_is_newer(stateid, &state->open_stateid))
+
+ if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+ nfs_state_log_out_of_order_open_stateid(state, stateid);
return true;
+ }
return false;
}
@@ -1399,11 +1475,14 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
!nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
nfs_resync_open_stateid_locked(state);
- return;
+ goto out;
}
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
+ trace_nfs4_open_stateid_update(state->inode, stateid, 0);
+out:
+ nfs_state_log_update_open_stateid(state);
}
static void nfs_clear_open_stateid(struct nfs4_state *state,
@@ -1420,29 +1499,60 @@ static void nfs_clear_open_stateid(struct nfs4_state *state,
}
static void nfs_set_open_stateid_locked(struct nfs4_state *state,
- const nfs4_stateid *stateid, fmode_t fmode,
- nfs4_stateid *freeme)
+ const nfs4_stateid *stateid, nfs4_stateid *freeme)
{
- switch (fmode) {
- case FMODE_READ:
- set_bit(NFS_O_RDONLY_STATE, &state->flags);
+ DEFINE_WAIT(wait);
+ int status = 0;
+ for (;;) {
+
+ if (!nfs_need_update_open_stateid(state, stateid))
+ return;
+ if (!test_bit(NFS_STATE_CHANGE_WAIT, &state->flags))
break;
- case FMODE_WRITE:
- set_bit(NFS_O_WRONLY_STATE, &state->flags);
+ if (status)
break;
- case FMODE_READ|FMODE_WRITE:
- set_bit(NFS_O_RDWR_STATE, &state->flags);
+ /* Rely on seqids for serialisation with NFSv4.0 */
+ if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client))
+ break;
+
+ prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE);
+ /*
+ * Ensure we process the state changes in the same order
+ * in which the server processed them by delaying the
+ * update of the stateid until we are in sequence.
+ */
+ write_sequnlock(&state->seqlock);
+ spin_unlock(&state->owner->so_lock);
+ rcu_read_unlock();
+ trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
+ if (!signal_pending(current)) {
+ if (schedule_timeout(5*HZ) == 0)
+ status = -EAGAIN;
+ else
+ status = 0;
+ } else
+ status = -EINTR;
+ finish_wait(&state->waitq, &wait);
+ rcu_read_lock();
+ spin_lock(&state->owner->so_lock);
+ write_seqlock(&state->seqlock);
}
- if (!nfs_need_update_open_stateid(state, stateid, freeme))
- return;
+
+ if (test_bit(NFS_OPEN_STATE, &state->flags) &&
+ !nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ nfs4_stateid_copy(freeme, &state->open_stateid);
+ nfs_test_and_clear_all_open_stateid(state);
+ }
+
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
+ trace_nfs4_open_stateid_update(state->inode, stateid, status);
+ nfs_state_log_update_open_stateid(state);
}
-static void __update_open_stateid(struct nfs4_state *state,
+static void nfs_state_set_open_stateid(struct nfs4_state *state,
const nfs4_stateid *open_stateid,
- const nfs4_stateid *deleg_stateid,
fmode_t fmode,
nfs4_stateid *freeme)
{
@@ -1450,17 +1560,34 @@ static void __update_open_stateid(struct nfs4_state *state,
* Protect the call to nfs4_state_set_mode_locked and
* serialise the stateid update
*/
- spin_lock(&state->owner->so_lock);
write_seqlock(&state->seqlock);
- if (deleg_stateid != NULL) {
- nfs4_stateid_copy(&state->stateid, deleg_stateid);
- set_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs_set_open_stateid_locked(state, open_stateid, freeme);
+ switch (fmode) {
+ case FMODE_READ:
+ set_bit(NFS_O_RDONLY_STATE, &state->flags);
+ break;
+ case FMODE_WRITE:
+ set_bit(NFS_O_WRONLY_STATE, &state->flags);
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ set_bit(NFS_O_RDWR_STATE, &state->flags);
}
- if (open_stateid != NULL)
- nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme);
+ set_bit(NFS_OPEN_STATE, &state->flags);
+ write_sequnlock(&state->seqlock);
+}
+
+static void nfs_state_set_delegation(struct nfs4_state *state,
+ const nfs4_stateid *deleg_stateid,
+ fmode_t fmode)
+{
+ /*
+ * Protect the call to nfs4_state_set_mode_locked and
+ * serialise the stateid update
+ */
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, deleg_stateid);
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
write_sequnlock(&state->seqlock);
- update_open_stateflags(state, fmode);
- spin_unlock(&state->owner->so_lock);
}
static int update_open_stateid(struct nfs4_state *state,
@@ -1478,6 +1605,12 @@ static int update_open_stateid(struct nfs4_state *state,
fmode &= (FMODE_READ|FMODE_WRITE);
rcu_read_lock();
+ spin_lock(&state->owner->so_lock);
+ if (open_stateid != NULL) {
+ nfs_state_set_open_stateid(state, open_stateid, fmode, &freeme);
+ ret = 1;
+ }
+
deleg_cur = rcu_dereference(nfsi->delegation);
if (deleg_cur == NULL)
goto no_delegation;
@@ -1494,18 +1627,16 @@ static int update_open_stateid(struct nfs4_state *state,
goto no_delegation_unlock;
nfs_mark_delegation_referenced(deleg_cur);
- __update_open_stateid(state, open_stateid, &deleg_cur->stateid,
- fmode, &freeme);
+ nfs_state_set_delegation(state, &deleg_cur->stateid, fmode);
ret = 1;
no_delegation_unlock:
spin_unlock(&deleg_cur->lock);
no_delegation:
+ if (ret)
+ update_open_stateflags(state, fmode);
+ spin_unlock(&state->owner->so_lock);
rcu_read_unlock();
- if (!ret && open_stateid != NULL) {
- __update_open_stateid(state, open_stateid, NULL, fmode, &freeme);
- ret = 1;
- }
if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
nfs4_schedule_state_manager(clp);
if (freeme.type != 0)
@@ -1761,7 +1892,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
struct nfs4_opendata *opendata;
opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
- NULL, NULL, claim, GFP_NOFS);
+ NULL, claim, GFP_NOFS);
if (opendata == NULL)
return ERR_PTR(-ENOMEM);
opendata->state = state;
@@ -2518,7 +2649,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
- atomic_inc(&lsp->ls_count);
+ refcount_inc(&lsp->ls_count);
spin_unlock(&state->state_lock);
nfs4_put_lock_state(prev);
@@ -2692,8 +2823,7 @@ out:
static int _nfs4_do_open(struct inode *dir,
struct nfs_open_context *ctx,
int flags,
- struct iattr *sattr,
- struct nfs4_label *label,
+ const struct nfs4_open_createattrs *c,
int *opened)
{
struct nfs4_state_owner *sp;
@@ -2705,6 +2835,8 @@ static int _nfs4_do_open(struct inode *dir,
struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+ struct iattr *sattr = c->sattr;
+ struct nfs4_label *label = c->label;
struct nfs4_label *olabel = NULL;
int status;
@@ -2723,8 +2855,8 @@ static int _nfs4_do_open(struct inode *dir,
status = -ENOMEM;
if (d_really_is_positive(dentry))
claim = NFS4_OPEN_CLAIM_FH;
- opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
- label, claim, GFP_KERNEL);
+ opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags,
+ c, claim, GFP_KERNEL);
if (opendata == NULL)
goto err_put_state_owner;
@@ -2805,10 +2937,18 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_state *res;
+ struct nfs4_open_createattrs c = {
+ .label = label,
+ .sattr = sattr,
+ .verf = {
+ [0] = (__u32)jiffies,
+ [1] = (__u32)current->pid,
+ },
+ };
int status;
do {
- status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened);
+ status = _nfs4_do_open(dir, ctx, flags, &c, opened);
res = ctx->state;
trace_nfs4_open_file(ctx, flags, status);
if (status == 0)
@@ -3024,18 +3164,20 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
calldata->arg.lr_args = NULL;
calldata->res.lr_res = NULL;
break;
+ case -NFS4ERR_OLD_STATEID:
+ if (nfs4_refresh_layout_stateid(&calldata->arg.lr_args->stateid,
+ calldata->inode))
+ goto lr_restart;
+ /* Fallthrough */
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
case -NFS4ERR_WRONG_CRED:
calldata->arg.lr_args = NULL;
calldata->res.lr_res = NULL;
- calldata->res.lr_ret = 0;
- rpc_restart_call_prepare(task);
- return;
+ goto lr_restart;
}
}
@@ -3051,39 +3193,43 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (calldata->arg.bitmask != NULL) {
calldata->arg.bitmask = NULL;
calldata->res.fattr = NULL;
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- goto out_release;
+ goto out_restart;
}
break;
+ case -NFS4ERR_OLD_STATEID:
+ /* Did we race with OPEN? */
+ if (nfs4_refresh_open_stateid(&calldata->arg.stateid,
+ state))
+ goto out_restart;
+ goto out_release;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
nfs4_free_revoked_stateid(server,
&calldata->arg.stateid,
task->tk_msg.rpc_cred);
- case -NFS4ERR_OLD_STATEID:
+ /* Fallthrough */
case -NFS4ERR_BAD_STATEID:
- if (!nfs4_stateid_match(&calldata->arg.stateid,
- &state->open_stateid)) {
- rpc_restart_call_prepare(task);
- goto out_release;
- }
- if (calldata->arg.fmode == 0)
- break;
+ break;
default:
- if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
- rpc_restart_call_prepare(task);
- goto out_release;
- }
+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
+ goto out_restart;
}
nfs_clear_open_stateid(state, &calldata->arg.stateid,
res_stateid, calldata->arg.fmode);
out_release:
+ task->tk_status = 0;
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, &calldata->fattr);
dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
+ return;
+lr_restart:
+ calldata->res.lr_ret = 0;
+out_restart:
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ goto out_release;
}
static void nfs4_close_prepare(struct rpc_task *task, void *data)
@@ -3103,7 +3249,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
- nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid);
/* Calculate the change in open mode */
calldata->arg.fmode = 0;
if (state->n_rdwr == 0) {
@@ -3121,7 +3266,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
if (!nfs4_valid_open_stateid(state) ||
- test_bit(NFS_OPEN_STATE, &state->flags) == 0)
+ !nfs4_refresh_open_stateid(&calldata->arg.stateid, state))
call_close = 0;
spin_unlock(&state->owner->so_lock);
@@ -3215,6 +3360,8 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
calldata->inode = state->inode;
calldata->state = state;
calldata->arg.fh = NFS_FH(state->inode);
+ if (!nfs4_copy_open_stateid(&calldata->arg.stateid, state))
+ goto out_free_calldata;
/* Serialization for the sequence id */
alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
@@ -3889,6 +4036,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
struct nfs4_accessargs args = {
.fh = NFS_FH(inode),
.bitmask = server->cache_consistency_bitmask,
+ .access = entry->mask,
};
struct nfs4_accessres res = {
.server = server,
@@ -3899,26 +4047,8 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
.rpc_resp = &res,
.rpc_cred = entry->cred,
};
- int mode = entry->mask;
int status = 0;
- /*
- * Determine which access bits we want to ask for...
- */
- if (mode & MAY_READ)
- args.access |= NFS4_ACCESS_READ;
- if (S_ISDIR(inode->i_mode)) {
- if (mode & MAY_WRITE)
- args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE;
- if (mode & MAY_EXEC)
- args.access |= NFS4_ACCESS_LOOKUP;
- } else {
- if (mode & MAY_WRITE)
- args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND;
- if (mode & MAY_EXEC)
- args.access |= NFS4_ACCESS_EXECUTE;
- }
-
res.fattr = nfs_alloc_fattr();
if (res.fattr == NULL)
return -ENOMEM;
@@ -4843,7 +4973,7 @@ static void nfs4_renew_release(void *calldata)
struct nfs4_renewdata *data = calldata;
struct nfs_client *clp = data->client;
- if (atomic_read(&clp->cl_count) > 1)
+ if (refcount_read(&clp->cl_count) > 1)
nfs4_schedule_state_renewal(clp);
nfs_put_client(clp);
kfree(data);
@@ -4891,7 +5021,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred,
if (renew_flags == 0)
return 0;
- if (!atomic_inc_not_zero(&clp->cl_count))
+ if (!refcount_inc_not_zero(&clp->cl_count))
return -EIO;
data = kmalloc(sizeof(*data), GFP_NOFS);
if (data == NULL) {
@@ -5643,18 +5773,20 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
data->args.lr_args = NULL;
data->res.lr_res = NULL;
break;
+ case -NFS4ERR_OLD_STATEID:
+ if (nfs4_refresh_layout_stateid(&data->args.lr_args->stateid,
+ data->inode))
+ goto lr_restart;
+ /* Fallthrough */
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
case -NFS4ERR_WRONG_CRED:
data->args.lr_args = NULL;
data->res.lr_res = NULL;
- data->res.lr_ret = 0;
- rpc_restart_call_prepare(task);
- return;
+ goto lr_restart;
}
}
@@ -5668,27 +5800,36 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
nfs4_free_revoked_stateid(data->res.server,
data->args.stateid,
task->tk_msg.rpc_cred);
+ /* Fallthrough */
case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_STALE_STATEID:
task->tk_status = 0;
break;
+ case -NFS4ERR_OLD_STATEID:
+ if (nfs4_refresh_delegation_stateid(&data->stateid, data->inode))
+ goto out_restart;
+ task->tk_status = 0;
+ break;
case -NFS4ERR_ACCESS:
if (data->args.bitmask) {
data->args.bitmask = NULL;
data->res.fattr = NULL;
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- return;
+ goto out_restart;
}
+ /* Fallthrough */
default:
if (nfs4_async_handle_error(task, data->res.server,
NULL, NULL) == -EAGAIN) {
- rpc_restart_call_prepare(task);
- return;
+ goto out_restart;
}
}
data->rpc_status = task->tk_status;
+ return;
+lr_restart:
+ data->res.lr_ret = 0;
+out_restart:
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
}
static void nfs4_delegreturn_release(void *calldata)
@@ -5896,7 +6037,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
p->arg.seqid = seqid;
p->res.seqid = seqid;
p->lsp = lsp;
- atomic_inc(&lsp->ls_count);
+ refcount_inc(&lsp->ls_count);
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
p->l_ctx = nfs_get_lock_context(ctx);
@@ -6112,7 +6253,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
p->res.lock_seqid = p->arg.lock_seqid;
p->lsp = lsp;
p->server = server;
- atomic_inc(&lsp->ls_count);
+ refcount_inc(&lsp->ls_count);
p->ctx = get_nfs_open_context(ctx);
memcpy(&p->fl, fl, sizeof(p->fl));
return p;
@@ -6568,6 +6709,20 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
return -ENOLCK;
+ /*
+ * Don't rely on the VFS having checked the file open mode,
+ * since it won't do this for flock() locks.
+ */
+ switch (request->fl_type) {
+ case F_RDLCK:
+ if (!(filp->f_mode & FMODE_READ))
+ return -EBADF;
+ break;
+ case F_WRLCK:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ }
+
status = nfs4_set_lock_state(state, request);
if (status != 0)
return status;
@@ -6763,9 +6918,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
struct page *page)
{
struct nfs_server *server = NFS_SERVER(dir);
- u32 bitmask[3] = {
- [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
- };
+ u32 bitmask[3];
struct nfs4_fs_locations_arg args = {
.dir_fh = NFS_FH(dir),
.name = name,
@@ -6784,12 +6937,15 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
dprintk("%s: start\n", __func__);
+ bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS;
+ bitmask[1] = nfs4_fattr_bitmap[1];
+
/* Ask for the fileid of the absent filesystem if mounted_on_fileid
* is not supported */
if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
- bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID;
+ bitmask[0] &= ~FATTR4_WORD0_FILEID;
else
- bitmask[0] |= FATTR4_WORD0_FILEID;
+ bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
nfs_fattr_init(&fs_locations->fattr);
fs_locations->server = server;
@@ -7472,7 +7628,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
struct nfs41_exchange_id_data *calldata;
int status;
- if (!atomic_inc_not_zero(&clp->cl_count))
+ if (!refcount_inc_not_zero(&clp->cl_count))
return ERR_PTR(-EIO);
status = -ENOMEM;
@@ -8072,7 +8228,7 @@ static void nfs41_sequence_release(void *data)
struct nfs4_sequence_data *calldata = data;
struct nfs_client *clp = calldata->clp;
- if (atomic_read(&clp->cl_count) > 1)
+ if (refcount_read(&clp->cl_count) > 1)
nfs4_schedule_state_renewal(clp);
nfs_put_client(clp);
kfree(calldata);
@@ -8101,7 +8257,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
trace_nfs4_sequence(clp, task->tk_status);
if (task->tk_status < 0) {
dprintk("%s ERROR %d\n", __func__, task->tk_status);
- if (atomic_read(&clp->cl_count) == 1)
+ if (refcount_read(&clp->cl_count) == 1)
goto out;
if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
@@ -8135,6 +8291,7 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
struct rpc_cred *cred,
+ struct nfs4_slot *slot,
bool is_privileged)
{
struct nfs4_sequence_data *calldata;
@@ -8148,15 +8305,18 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
.callback_ops = &nfs41_sequence_ops,
.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
};
+ struct rpc_task *ret;
- if (!atomic_inc_not_zero(&clp->cl_count))
- return ERR_PTR(-EIO);
+ ret = ERR_PTR(-EIO);
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ goto out_err;
+
+ ret = ERR_PTR(-ENOMEM);
calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
- if (calldata == NULL) {
- nfs_put_client(clp);
- return ERR_PTR(-ENOMEM);
- }
+ if (calldata == NULL)
+ goto out_put_clp;
nfs4_init_sequence(&calldata->args, &calldata->res, 0);
+ nfs4_sequence_attach_slot(&calldata->args, &calldata->res, slot);
if (is_privileged)
nfs4_set_sequence_privileged(&calldata->args);
msg.rpc_argp = &calldata->args;
@@ -8164,7 +8324,15 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
calldata->clp = clp;
task_setup_data.callback_data = calldata;
- return rpc_run_task(&task_setup_data);
+ ret = rpc_run_task(&task_setup_data);
+ if (IS_ERR(ret))
+ goto out_err;
+ return ret;
+out_put_clp:
+ nfs_put_client(clp);
+out_err:
+ nfs41_release_slot(slot);
+ return ret;
}
static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
@@ -8174,7 +8342,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
return -EAGAIN;
- task = _nfs41_proc_sequence(clp, cred, false);
+ task = _nfs41_proc_sequence(clp, cred, NULL, false);
if (IS_ERR(task))
ret = PTR_ERR(task);
else
@@ -8188,7 +8356,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
struct rpc_task *task;
int ret;
- task = _nfs41_proc_sequence(clp, cred, true);
+ task = _nfs41_proc_sequence(clp, cred, NULL, true);
if (IS_ERR(task)) {
ret = PTR_ERR(task);
goto out;
@@ -8588,18 +8756,27 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
server = NFS_SERVER(lrp->args.inode);
switch (task->tk_status) {
+ case -NFS4ERR_OLD_STATEID:
+ if (nfs4_refresh_layout_stateid(&lrp->args.stateid,
+ lrp->args.inode))
+ goto out_restart;
+ /* Fallthrough */
default:
task->tk_status = 0;
+ /* Fallthrough */
case 0:
break;
case -NFS4ERR_DELAY:
if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
break;
- nfs4_sequence_free_slot(&lrp->res.seq_res);
- rpc_restart_call_prepare(task);
- return;
+ goto out_restart;
}
dprintk("<-- %s\n", __func__);
+ return;
+out_restart:
+ task->tk_status = 0;
+ nfs4_sequence_free_slot(&lrp->res.seq_res);
+ rpc_restart_call_prepare(task);
}
static void nfs4_layoutreturn_release(void *calldata)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0378e2257ca7..54fd56d715a8 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -69,6 +69,14 @@ const nfs4_stateid zero_stateid = {
{ .data = { 0 } },
.type = NFS4_SPECIAL_STATEID_TYPE,
};
+const nfs4_stateid invalid_stateid = {
+ {
+ .seqid = cpu_to_be32(0xffffffffU),
+ .other = { 0 },
+ },
+ .type = NFS4_INVALID_STATEID_TYPE,
+};
+
static DEFINE_MUTEX(nfs_clid_init_mutex);
int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -645,6 +653,7 @@ nfs4_alloc_open_state(void)
INIT_LIST_HEAD(&state->lock_states);
spin_lock_init(&state->state_lock);
seqlock_init(&state->seqlock);
+ init_waitqueue_head(&state->waitq);
return state;
}
@@ -825,7 +834,7 @@ __nfs4_find_lock_state(struct nfs4_state *state,
ret = pos;
}
if (ret)
- atomic_inc(&ret->ls_count);
+ refcount_inc(&ret->ls_count);
return ret;
}
@@ -843,7 +852,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
if (lsp == NULL)
return NULL;
nfs4_init_seqid_counter(&lsp->ls_seqid);
- atomic_set(&lsp->ls_count, 1);
+ refcount_set(&lsp->ls_count, 1);
lsp->ls_state = state;
lsp->ls_owner = fl_owner;
lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
@@ -907,7 +916,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
if (lsp == NULL)
return;
state = lsp->ls_state;
- if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock))
+ if (!refcount_dec_and_lock(&lsp->ls_count, &state->state_lock))
return;
list_del(&lsp->ls_locks);
if (list_empty(&state->lock_states))
@@ -927,7 +936,7 @@ static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner;
dst->fl_u.nfs4_fl.owner = lsp;
- atomic_inc(&lsp->ls_count);
+ refcount_inc(&lsp->ls_count);
}
static void nfs4_fl_release_lock(struct file_lock *fl)
@@ -985,18 +994,39 @@ out:
return ret;
}
-static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+{
+ bool ret;
+ int seq;
+
+ do {
+ ret = false;
+ seq = read_seqbegin(&state->seqlock);
+ if (nfs4_state_match_open_stateid_other(state, dst)) {
+ dst->seqid = state->open_stateid.seqid;
+ ret = true;
+ }
+ } while (read_seqretry(&state->seqlock, seq));
+ return ret;
+}
+
+bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
{
+ bool ret;
const nfs4_stateid *src;
int seq;
do {
+ ret = false;
src = &zero_stateid;
seq = read_seqbegin(&state->seqlock);
- if (test_bit(NFS_OPEN_STATE, &state->flags))
+ if (test_bit(NFS_OPEN_STATE, &state->flags)) {
src = &state->open_stateid;
+ ret = true;
+ }
nfs4_stateid_copy(dst, src);
} while (read_seqretry(&state->seqlock, seq));
+ return ret;
}
/*
@@ -1177,7 +1207,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
return;
__module_get(THIS_MODULE);
- atomic_inc(&clp->cl_count);
+ refcount_inc(&clp->cl_count);
/* The rcu_read_lock() is not strictly necessary, as the state
* manager is the only thread that ever changes the rpc_xprt
@@ -1269,7 +1299,7 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp)
might_sleep();
- atomic_inc(&clp->cl_count);
+ refcount_inc(&clp->cl_count);
res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
nfs_wait_bit_killable, TASK_KILLABLE);
if (res)
@@ -1409,6 +1439,11 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
found = true;
continue;
}
+ if (nfs4_stateid_match_other(&state->open_stateid, stateid) &&
+ nfs4_state_mark_reclaim_nograce(clp, state)) {
+ found = true;
+ continue;
+ }
if (nfs_state_lock_state_matches_stateid(state, stateid) &&
nfs4_state_mark_reclaim_nograce(clp, state))
found = true;
@@ -2510,7 +2545,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
break;
if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
break;
- } while (atomic_read(&clp->cl_count) > 1);
+ } while (refcount_read(&clp->cl_count) > 1);
return;
out_error:
if (strlen(section))
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index e7c6275519b0..a275fba93170 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -202,17 +202,13 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event,
TP_ARGS(clp, error),
TP_STRUCT__entry(
- __string(dstaddr,
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR))
+ __string(dstaddr, clp->cl_hostname)
__field(int, error)
),
TP_fast_assign(
__entry->error = error;
- __assign_str(dstaddr,
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR));
+ __assign_str(dstaddr, clp->cl_hostname);
),
TP_printk(
@@ -1066,6 +1062,8 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait);
DECLARE_EVENT_CLASS(nfs4_getattr_event,
TP_PROTO(
@@ -1133,9 +1131,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
- __string(dstaddr, clp ?
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR) : "unknown")
+ __string(dstaddr, clp ? clp->cl_hostname : "unknown")
),
TP_fast_assign(
@@ -1148,9 +1144,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
__entry->fileid = 0;
__entry->dev = 0;
}
- __assign_str(dstaddr, clp ?
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR) : "unknown")
+ __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown")
),
TP_printk(
@@ -1192,9 +1186,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
- __string(dstaddr, clp ?
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR) : "unknown")
+ __string(dstaddr, clp ? clp->cl_hostname : "unknown")
__field(int, stateid_seq)
__field(u32, stateid_hash)
),
@@ -1209,9 +1201,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
__entry->fileid = 0;
__entry->dev = 0;
}
- __assign_str(dstaddr, clp ?
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR) : "unknown")
+ __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown")
__entry->stateid_seq =
be32_to_cpu(stateid->seqid);
__entry->stateid_hash =
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 14ed9791ec9c..77c6729e57f0 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4385,6 +4385,14 @@ static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *state
return decode_stateid(xdr, stateid);
}
+static int decode_invalid_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ nfs4_stateid dummy;
+
+ nfs4_stateid_copy(stateid, &invalid_stateid);
+ return decode_stateid(xdr, &dummy);
+}
+
static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
{
int status;
@@ -4393,7 +4401,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
if (!status)
- status = decode_open_stateid(xdr, &res->stateid);
+ status = decode_invalid_stateid(xdr, &res->stateid);
return status;
}
@@ -6108,6 +6116,8 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
res->lrs_present = be32_to_cpup(p);
if (res->lrs_present)
status = decode_layout_stateid(xdr, &res->stateid);
+ else
+ nfs4_stateid_copy(&res->stateid, &invalid_stateid);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3bcd669a3152..d602fe9e1ac8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -251,7 +251,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
- atomic_inc(&lo->plh_refcount);
+ refcount_inc(&lo->plh_refcount);
}
static struct pnfs_layout_hdr *
@@ -296,7 +296,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
pnfs_layoutreturn_before_put_layout_hdr(lo);
- if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+ if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
if (!list_empty(&lo->plh_segs))
WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
pnfs_detach_layout_hdr(lo);
@@ -355,6 +355,24 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
}
/*
+ * Update the seqid of a layout stateid
+ */
+bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, struct inode *inode)
+{
+ struct pnfs_layout_hdr *lo;
+ bool ret = false;
+
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+ dst->seqid = lo->plh_stateid.seqid;
+ ret = true;
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+/*
* Mark a pnfs_layout_hdr and all associated layout segments as invalid
*
* In order to continue using the pnfs_layout_hdr, a full recovery
@@ -395,14 +413,14 @@ pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
lo->plh_retry_timestamp = jiffies;
if (!test_and_set_bit(fail_bit, &lo->plh_flags))
- atomic_inc(&lo->plh_refcount);
+ refcount_inc(&lo->plh_refcount);
}
static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
if (test_and_clear_bit(fail_bit, &lo->plh_flags))
- atomic_dec(&lo->plh_refcount);
+ refcount_dec(&lo->plh_refcount);
}
static void
@@ -450,7 +468,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
{
INIT_LIST_HEAD(&lseg->pls_list);
INIT_LIST_HEAD(&lseg->pls_lc_list);
- atomic_set(&lseg->pls_refcount, 1);
+ refcount_set(&lseg->pls_refcount, 1);
set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
lseg->pls_range = *range;
@@ -472,7 +490,7 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
list_del_init(&lseg->pls_list);
/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
- atomic_dec(&lo->plh_refcount);
+ refcount_dec(&lo->plh_refcount);
if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
return;
if (list_empty(&lo->plh_segs) &&
@@ -507,13 +525,13 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
return;
dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
- atomic_read(&lseg->pls_refcount),
+ refcount_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
lo = lseg->pls_layout;
inode = lo->plh_inode;
- if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+ if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
spin_unlock(&inode->i_lock);
return;
@@ -551,7 +569,7 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
struct list_head *tmp_list)
{
- if (!atomic_dec_and_test(&lseg->pls_refcount))
+ if (!refcount_dec_and_test(&lseg->pls_refcount))
return false;
pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
list_add(&lseg->pls_list, tmp_list);
@@ -570,7 +588,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
* outstanding io is finished.
*/
dprintk("%s: lseg %p ref %d\n", __func__, lseg,
- atomic_read(&lseg->pls_refcount));
+ refcount_read(&lseg->pls_refcount));
if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
rv = 1;
}
@@ -1451,7 +1469,7 @@ alloc_init_layout_hdr(struct inode *ino,
lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
if (!lo)
return NULL;
- atomic_set(&lo->plh_refcount, 1);
+ refcount_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
INIT_LIST_HEAD(&lo->plh_return_segs);
@@ -1513,7 +1531,7 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
if ((range->iomode == IOMODE_RW &&
ls_range->iomode != IOMODE_RW) ||
(range->iomode != ls_range->iomode &&
- strict_iomode == true) ||
+ strict_iomode) ||
!pnfs_lseg_range_intersecting(ls_range, range))
return 0;
@@ -1546,7 +1564,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
}
dprintk("%s:Return lseg %p ref %d\n",
- __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
+ __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0);
return ret;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 87f144f14d1e..8d507c361d98 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,7 @@
#ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H
+#include <linux/refcount.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/workqueue.h>
@@ -54,7 +55,7 @@ struct nfs4_pnfs_ds {
char *ds_remotestr; /* comma sep list of addrs */
struct list_head ds_addrs;
struct nfs_client *ds_clp;
- atomic_t ds_count;
+ refcount_t ds_count;
unsigned long ds_state;
#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
};
@@ -63,7 +64,7 @@ struct pnfs_layout_segment {
struct list_head pls_list;
struct list_head pls_lc_list;
struct pnfs_layout_range pls_range;
- atomic_t pls_refcount;
+ refcount_t pls_refcount;
u32 pls_seq;
unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
@@ -179,7 +180,7 @@ struct pnfs_layoutdriver_type {
};
struct pnfs_layout_hdr {
- atomic_t plh_refcount;
+ refcount_t plh_refcount;
atomic_t plh_outstanding; /* number of RPCs out */
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_bulk_destroy;
@@ -251,6 +252,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
bool is_recall);
int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
bool is_recall);
+bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, struct inode *inode);
void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new,
@@ -393,7 +395,7 @@ static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
if (lseg) {
- atomic_inc(&lseg->pls_refcount);
+ refcount_inc(&lseg->pls_refcount);
smp_mb__after_atomic();
}
return lseg;
@@ -764,6 +766,11 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
{
}
+static inline bool nfs4_refresh_layout_stateid(nfs4_stateid *dst,
+ struct inode *inode)
+{
+ return false;
+}
#endif /* CONFIG_NFS_V4_1 */
#if IS_ENABLED(CONFIG_NFS_V4_2)
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 60da59be83b6..03aaa60c7768 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -338,7 +338,7 @@ print_ds(struct nfs4_pnfs_ds *ds)
" client %p\n"
" cl_exchange_flags %x\n",
ds->ds_remotestr,
- atomic_read(&ds->ds_count), ds->ds_clp,
+ refcount_read(&ds->ds_count), ds->ds_clp,
ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
}
@@ -451,7 +451,7 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds)
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
{
- if (atomic_dec_and_lock(&ds->ds_count,
+ if (refcount_dec_and_lock(&ds->ds_count,
&nfs4_ds_cache_lock)) {
list_del_init(&ds->ds_node);
spin_unlock(&nfs4_ds_cache_lock);
@@ -537,7 +537,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
INIT_LIST_HEAD(&ds->ds_addrs);
list_splice_init(dsaddrs, &ds->ds_addrs);
ds->ds_remotestr = remotestr;
- atomic_set(&ds->ds_count, 1);
+ refcount_set(&ds->ds_count, 1);
INIT_LIST_HEAD(&ds->ds_node);
ds->ds_clp = NULL;
list_add(&ds->ds_node, &nfs4_data_server_cache);
@@ -546,10 +546,10 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
} else {
kfree(remotestr);
kfree(ds);
- atomic_inc(&tmp_ds->ds_count);
+ refcount_inc(&tmp_ds->ds_count);
dprintk("%s data server %s found, inc'ed ds_count to %d\n",
__func__, tmp_ds->ds_remotestr,
- atomic_read(&tmp_ds->ds_count));
+ refcount_read(&tmp_ds->ds_count));
ds = tmp_ds;
}
spin_unlock(&nfs4_ds_cache_lock);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c9d24bae3025..43cadb28db6e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1332,7 +1332,7 @@ static int nfs_parse_mount_options(char *raw,
mnt->options |= NFS_OPTION_MIGRATION;
break;
case Opt_nomigration:
- mnt->options &= NFS_OPTION_MIGRATION;
+ mnt->options &= ~NFS_OPTION_MIGRATION;
break;
/*
@@ -1456,18 +1456,21 @@ static int nfs_parse_mount_options(char *raw,
switch (token) {
case Opt_xprt_udp6:
protofamily = AF_INET6;
+ /* fall through */
case Opt_xprt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
break;
case Opt_xprt_tcp6:
protofamily = AF_INET6;
+ /* fall through */
case Opt_xprt_tcp:
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
break;
case Opt_xprt_rdma6:
protofamily = AF_INET6;
+ /* fall through */
case Opt_xprt_rdma:
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
@@ -1494,11 +1497,13 @@ static int nfs_parse_mount_options(char *raw,
switch (token) {
case Opt_xprt_udp6:
mountfamily = AF_INET6;
+ /* fall through */
case Opt_xprt_udp:
mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
break;
case Opt_xprt_tcp6:
mountfamily = AF_INET6;
+ /* fall through */
case Opt_xprt_tcp:
mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
break;
@@ -1988,9 +1993,9 @@ static int nfs23_validate_mount_data(void *options,
args->version = NFS_DEFAULT_VERSION;
switch (data->version) {
case 1:
- data->namlen = 0;
+ data->namlen = 0; /* fall through */
case 2:
- data->bsize = 0;
+ data->bsize = 0; /* fall through */
case 3:
if (data->flags & NFS_MOUNT_VER3)
goto out_no_v3;
@@ -1998,11 +2003,14 @@ static int nfs23_validate_mount_data(void *options,
memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
/* Turn off security negotiation */
extra_flags |= NFS_MOUNT_SECFLAVOUR;
+ /* fall through */
case 4:
if (data->flags & NFS_MOUNT_SECFLAVOUR)
goto out_no_sec;
+ /* fall through */
case 5:
memset(data->context, 0, sizeof(data->context));
+ /* fall through */
case 6:
if (data->flags & NFS_MOUNT_VER3) {
if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index babebbccae2a..5b5f464f6f2a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -487,10 +487,8 @@ try_again:
}
ret = nfs_page_group_lock(head);
- if (ret < 0) {
- nfs_unlock_and_release_request(head);
- return ERR_PTR(ret);
- }
+ if (ret < 0)
+ goto release_request;
/* lock each request in the page group */
total_bytes = head->wb_bytes;
@@ -515,8 +513,7 @@ try_again:
if (ret < 0) {
nfs_unroll_locks(inode, head, subreq);
nfs_release_request(subreq);
- nfs_unlock_and_release_request(head);
- return ERR_PTR(ret);
+ goto release_request;
}
}
/*
@@ -532,8 +529,8 @@ try_again:
nfs_page_group_unlock(head);
nfs_unroll_locks(inode, head, subreq);
nfs_unlock_and_release_request(subreq);
- nfs_unlock_and_release_request(head);
- return ERR_PTR(-EIO);
+ ret = -EIO;
+ goto release_request;
}
}
@@ -576,6 +573,10 @@ try_again:
/* still holds ref on head from nfs_page_find_head_request
* and still has lock on head from lock loop */
return head;
+
+release_request:
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
}
static void nfs_write_error_remove_page(struct nfs_page *req)
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index 420d3a0ab258..897b299db55e 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -55,14 +55,7 @@ locks_end_grace(struct lock_manager *lm)
}
EXPORT_SYMBOL_GPL(locks_end_grace);
-/**
- * locks_in_grace
- *
- * Lock managers call this function to determine when it is OK for them
- * to answer ordinary lock requests, and when they should accept only
- * lock reclaims.
- */
-int
+static bool
__state_in_grace(struct net *net, bool open)
{
struct list_head *grace_list = net_generic(net, grace_net_id);
@@ -78,15 +71,22 @@ __state_in_grace(struct net *net, bool open)
return false;
}
-int locks_in_grace(struct net *net)
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+bool locks_in_grace(struct net *net)
{
- return __state_in_grace(net, 0);
+ return __state_in_grace(net, false);
}
EXPORT_SYMBOL_GPL(locks_in_grace);
-int opens_in_grace(struct net *net)
+bool opens_in_grace(struct net *net)
{
- return __state_in_grace(net, 1);
+ return __state_in_grace(net, true);
}
EXPORT_SYMBOL_GPL(opens_in_grace);
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 6dfede6d172a..84831253203d 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -12,6 +12,7 @@
#include <linux/nsproxy.h>
#include <linux/sunrpc/addr.h>
#include <linux/uaccess.h>
+#include <linux/kernel.h>
#include "state.h"
#include "netns.h"
@@ -126,8 +127,6 @@ static struct nfsd_fault_inject_op inject_ops[] = {
},
};
-#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op))
-
int nfsd_fault_inject_init(void)
{
unsigned int i;
@@ -138,7 +137,7 @@ int nfsd_fault_inject_init(void)
if (!debug_dir)
goto fail;
- for (i = 0; i < NUM_INJECT_OPS; i++) {
+ for (i = 0; i < ARRAY_SIZE(inject_ops); i++) {
op = &inject_ops[i];
if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd))
goto fail;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 3714231a9d0f..1c91391f4805 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -107,7 +107,7 @@ struct nfsd_net {
bool lockd_up;
/* Time of server startup */
- struct timeval nfssvc_boot;
+ struct timespec64 nfssvc_boot;
/*
* Max number of connections this nfsd container will allow. Defaults
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f38acd905441..2758480555fa 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -748,8 +748,9 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
if (resp->status == 0) {
*p++ = htonl(resp->count);
*p++ = htonl(resp->committed);
- *p++ = htonl(nn->nfssvc_boot.tv_sec);
- *p++ = htonl(nn->nfssvc_boot.tv_usec);
+ /* unique identifier, y2038 overflow can be ignored */
+ *p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
+ *p++ = htonl(nn->nfssvc_boot.tv_nsec);
}
return xdr_ressize_check(rqstp, p);
}
@@ -1119,8 +1120,9 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
p = encode_wcc_data(rqstp, p, &resp->fh);
/* Write verifier */
if (resp->status == 0) {
- *p++ = htonl(nn->nfssvc_boot.tv_sec);
- *p++ = htonl(nn->nfssvc_boot.tv_usec);
+ /* unique identifier, y2038 overflow can be ignored */
+ *p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
+ *p++ = htonl(nn->nfssvc_boot.tv_nsec);
}
return xdr_ressize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ea45d954e8d7..7d888369f85a 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -336,7 +336,7 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
trace_layout_recall(&ls->ls_stid.sc_stateid);
- atomic_inc(&ls->ls_stid.sc_count);
+ refcount_inc(&ls->ls_stid.sc_count);
nfsd4_run_cb(&ls->ls_recall);
out_unlock:
@@ -441,7 +441,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
goto done;
}
- atomic_inc(&ls->ls_stid.sc_count);
+ refcount_inc(&ls->ls_stid.sc_count);
list_add_tail(&new->lo_perstate, &ls->ls_layouts);
new = NULL;
done:
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8487486ec496..008ea0b627d0 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -485,9 +485,6 @@ static __be32
nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
- if (!cstate->current_fh.fh_dentry)
- return nfserr_nofilehandle;
-
u->getfh = &cstate->current_fh;
return nfs_ok;
}
@@ -535,9 +532,6 @@ static __be32
nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
- if (!cstate->current_fh.fh_dentry)
- return nfserr_nofilehandle;
-
fh_dup2(&cstate->save_fh, &cstate->current_fh);
if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) {
memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t));
@@ -570,10 +564,11 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
/*
* This is opaque to client, so no need to byte-swap. Use
- * __force to keep sparse happy
+ * __force to keep sparse happy. y2038 time_t overflow is
+ * irrelevant in this usage.
*/
verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
- verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec;
+ verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
memcpy(verifier->data, verf, sizeof(verifier->data));
}
@@ -703,10 +698,8 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_link *link = &u->link;
- __be32 status = nfserr_nofilehandle;
+ __be32 status;
- if (!cstate->save_fh.fh_dentry)
- return status;
status = nfsd_link(rqstp, &cstate->current_fh,
link->li_name, link->li_namelen, &cstate->save_fh);
if (!status)
@@ -850,10 +843,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_rename *rename = &u->rename;
- __be32 status = nfserr_nofilehandle;
+ __be32 status;
- if (!cstate->save_fh.fh_dentry)
- return status;
if (opens_in_grace(SVC_NET(rqstp)) &&
!(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK))
return nfserr_grace;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0c04f81aa63b..b82817767b9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -359,7 +359,7 @@ put_nfs4_file(struct nfs4_file *fi)
{
might_lock(&state_lock);
- if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
+ if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
hlist_del_rcu(&fi->fi_hash);
spin_unlock(&state_lock);
WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
@@ -568,7 +568,7 @@ alloc_clnt_odstate(struct nfs4_client *clp)
co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL);
if (co) {
co->co_client = clp;
- atomic_set(&co->co_odcount, 1);
+ refcount_set(&co->co_odcount, 1);
}
return co;
}
@@ -586,7 +586,7 @@ static inline void
get_clnt_odstate(struct nfs4_clnt_odstate *co)
{
if (co)
- atomic_inc(&co->co_odcount);
+ refcount_inc(&co->co_odcount);
}
static void
@@ -598,7 +598,7 @@ put_clnt_odstate(struct nfs4_clnt_odstate *co)
return;
fp = co->co_file;
- if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) {
+ if (refcount_dec_and_lock(&co->co_odcount, &fp->fi_lock)) {
list_del(&co->co_perfile);
spin_unlock(&fp->fi_lock);
@@ -656,7 +656,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla
stid->sc_stateid.si_opaque.so_id = new_id;
stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
/* Will be incremented before return to client: */
- atomic_set(&stid->sc_count, 1);
+ refcount_set(&stid->sc_count, 1);
spin_lock_init(&stid->sc_lock);
/*
@@ -813,7 +813,7 @@ nfs4_put_stid(struct nfs4_stid *s)
might_lock(&clp->cl_lock);
- if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) {
+ if (!refcount_dec_and_lock(&s->sc_count, &clp->cl_lock)) {
wake_up_all(&close_wq);
return;
}
@@ -913,7 +913,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
if (status)
return status;
++fp->fi_delegees;
- atomic_inc(&dp->dl_stid.sc_count);
+ refcount_inc(&dp->dl_stid.sc_count);
dp->dl_stid.sc_type = NFS4_DELEG_STID;
list_add(&dp->dl_perfile, &fp->fi_delegations);
list_add(&dp->dl_perclnt, &clp->cl_delegations);
@@ -1214,7 +1214,7 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
WARN_ON_ONCE(!list_empty(&stp->st_locks));
- if (!atomic_dec_and_test(&s->sc_count)) {
+ if (!refcount_dec_and_test(&s->sc_count)) {
wake_up_all(&close_wq);
return;
}
@@ -1439,8 +1439,10 @@ free_session_slots(struct nfsd4_session *ses)
{
int i;
- for (i = 0; i < ses->se_fchannel.maxreqs; i++)
+ for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
+ free_svc_cred(&ses->se_slots[i]->sl_cred);
kfree(ses->se_slots[i]);
+ }
}
/*
@@ -1472,6 +1474,11 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
spin_lock(&nfsd_drc_lock);
avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
nfsd_drc_max_mem - nfsd_drc_mem_used);
+ /*
+ * Never use more than a third of the remaining memory,
+ * unless it's the only way to give this client a slot:
+ */
+ avail = clamp_t(int, avail, slotsize, avail/3);
num = min_t(int, num, avail / slotsize);
nfsd_drc_mem_used += num * slotsize;
spin_unlock(&nfsd_drc_lock);
@@ -2072,7 +2079,7 @@ find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
s = find_stateid_locked(cl, t);
if (s != NULL) {
if (typemask & s->sc_type)
- atomic_inc(&s->sc_count);
+ refcount_inc(&s->sc_count);
else
s = NULL;
}
@@ -2287,14 +2294,18 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
dprintk("--> %s slot %p\n", __func__, slot);
+ slot->sl_flags |= NFSD4_SLOT_INITIALIZED;
slot->sl_opcnt = resp->opcnt;
slot->sl_status = resp->cstate.status;
+ free_svc_cred(&slot->sl_cred);
+ copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred);
- slot->sl_flags |= NFSD4_SLOT_INITIALIZED;
- if (nfsd4_not_cached(resp)) {
- slot->sl_datalen = 0;
+ if (!nfsd4_cache_this(resp)) {
+ slot->sl_flags &= ~NFSD4_SLOT_CACHED;
return;
}
+ slot->sl_flags |= NFSD4_SLOT_CACHED;
+
base = resp->cstate.data_offset;
slot->sl_datalen = buf->len - base;
if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
@@ -2321,8 +2332,16 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
op = &args->ops[resp->opcnt - 1];
nfsd4_encode_operation(resp, op);
- /* Return nfserr_retry_uncached_rep in next operation. */
- if (args->opcnt > 1 && !(slot->sl_flags & NFSD4_SLOT_CACHETHIS)) {
+ if (slot->sl_flags & NFSD4_SLOT_CACHED)
+ return op->status;
+ if (args->opcnt == 1) {
+ /*
+ * The original operation wasn't a solo sequence--we
+ * always cache those--so this retry must not match the
+ * original:
+ */
+ op->status = nfserr_seq_false_retry;
+ } else {
op = &args->ops[resp->opcnt++];
op->status = nfserr_retry_uncached_rep;
nfsd4_encode_operation(resp, op);
@@ -2986,6 +3005,34 @@ static bool nfsd4_request_too_big(struct svc_rqst *rqstp,
return xb->len > session->se_fchannel.maxreq_sz;
}
+/*
+ * replay_matches_cache - check whether a retransmitted SEQUENCE could
+ * really be a replay of the request whose reply is held in @slot.
+ * @rqstp: incoming request; supplies the compound arguments and the
+ *         credential being compared
+ * @seq: decoded SEQUENCE operation of the incoming request
+ * @slot: session slot holding the previously stored reply
+ *
+ * Returns false when the new call can be shown not to match the stored
+ * reply (the caller in nfsd4_sequence() then fails the request with
+ * nfserr_seq_false_retry), true otherwise.
+ */
+static bool replay_matches_cache(struct svc_rqst *rqstp,
+		 struct nfsd4_sequence *seq, struct nfsd4_slot *slot)
+{
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+	if ((bool)(slot->sl_flags & NFSD4_SLOT_CACHETHIS) !=
+	    (bool)seq->cachethis)
+		return false;
+	/*
+	 * If there's an error then the reply can have fewer ops than
+	 * the call.  Only when the stored status is success does a
+	 * shorter reply prove that the calls differ:
+	 */
+	if (slot->sl_opcnt < argp->opcnt && !slot->sl_status)
+		return false;
+	/*
+	 * But if we cached a reply with *more* ops than the call you're
+	 * sending us now, then this new call is clearly not really a
+	 * replay of the old one:
+	 */
+	if (slot->sl_opcnt > argp->opcnt)
+		return false;
+	/* This is the only check explicitly called for by the spec: */
+	if (!same_creds(&rqstp->rq_cred, &slot->sl_cred))
+		return false;
+	/*
+	 * There may be more comparisons we could actually do, but the
+	 * spec doesn't require us to catch every case where the calls
+	 * don't match (that would require caching the call as well as
+	 * the reply), so we don't bother.
+	 */
+	return true;
+}
+
__be32
nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -3045,6 +3092,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_seq_misordered;
if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED))
goto out_put_session;
+ status = nfserr_seq_false_retry;
+ if (!replay_matches_cache(rqstp, seq, slot))
+ goto out_put_session;
cstate->slot = slot;
cstate->session = session;
cstate->clp = clp;
@@ -3351,7 +3401,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
{
lockdep_assert_held(&state_lock);
- atomic_set(&fp->fi_ref, 1);
+ refcount_set(&fp->fi_ref, 1);
spin_lock_init(&fp->fi_lock);
INIT_LIST_HEAD(&fp->fi_stateids);
INIT_LIST_HEAD(&fp->fi_delegations);
@@ -3514,7 +3564,7 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
continue;
if (local->st_stateowner == &oo->oo_owner) {
ret = local;
- atomic_inc(&ret->st_stid.sc_count);
+ refcount_inc(&ret->st_stid.sc_count);
break;
}
}
@@ -3573,7 +3623,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open)
goto out_unlock;
open->op_stp = NULL;
- atomic_inc(&stp->st_stid.sc_count);
+ refcount_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_locks);
stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
@@ -3621,7 +3671,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
* there should be no danger of the refcount going back up again at
* this point.
*/
- wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2);
+ wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
release_all_access(s);
if (s->st_stid.sc_file) {
@@ -3647,7 +3697,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
if (fh_match(&fp->fi_fhandle, fh)) {
- if (atomic_inc_not_zero(&fp->fi_ref))
+ if (refcount_inc_not_zero(&fp->fi_ref))
return fp;
}
}
@@ -3783,7 +3833,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
* lock) we know the server hasn't removed the lease yet, we know
* it's safe to take a reference.
*/
- atomic_inc(&dp->dl_stid.sc_count);
+ refcount_inc(&dp->dl_stid.sc_count);
nfsd4_run_cb(&dp->dl_recall);
}
@@ -3966,7 +4016,8 @@ static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, statei
{
struct nfs4_stid *ret;
- ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID);
+ ret = find_stateid_by_type(cl, s,
+ NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID);
if (!ret)
return NULL;
return delegstateid(ret);
@@ -3989,6 +4040,12 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
if (deleg == NULL)
goto out;
+ if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) {
+ nfs4_put_stid(&deleg->dl_stid);
+ if (cl->cl_minorversion)
+ status = nfserr_deleg_revoked;
+ goto out;
+ }
flags = share_access_to_flags(open->op_share_access);
status = nfs4_check_delegmode(deleg, flags);
if (status) {
@@ -4858,6 +4915,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid **s, struct nfsd_net *nn)
{
__be32 status;
+ bool return_revoked = false;
+
+ /*
+ * Only return revoked delegations if explicitly asked;
+ * otherwise we report a revoked or bad_stateid status.
+ */
+ if (typemask & NFS4_REVOKED_DELEG_STID)
+ return_revoked = true;
+ else if (typemask & NFS4_DELEG_STID)
+ typemask |= NFS4_REVOKED_DELEG_STID;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return nfserr_bad_stateid;
@@ -4872,6 +4939,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
*s = find_stateid_by_type(cstate->clp, stateid, typemask);
if (!*s)
return nfserr_bad_stateid;
+ if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+ nfs4_put_stid(*s);
+ if (cstate->minorversion)
+ return nfserr_deleg_revoked;
+ return nfserr_bad_stateid;
+ }
return nfs_ok;
}
@@ -5071,7 +5144,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
ret = nfserr_locks_held;
break;
case NFS4_LOCK_STID:
- atomic_inc(&s->sc_count);
+ refcount_inc(&s->sc_count);
spin_unlock(&cl->cl_lock);
ret = nfsd4_free_lock_stateid(stateid, s);
goto out;
@@ -5578,7 +5651,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
lockdep_assert_held(&clp->cl_lock);
- atomic_inc(&stp->st_stid.sc_count);
+ refcount_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_LOCK_STID;
stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
get_nfs4_file(fp);
@@ -5604,7 +5677,7 @@ find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp)
list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
if (lst->st_stid.sc_file == fp) {
- atomic_inc(&lst->st_stid.sc_count);
+ refcount_inc(&lst->st_stid.sc_count);
return lst;
}
}
@@ -7006,8 +7079,8 @@ nfs4_state_start_net(struct net *net)
nn->nfsd4_manager.block_opens = true;
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
- printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
- nn->nfsd4_grace, net);
+ printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
+ nn->nfsd4_grace, net->ns.inum);
queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
return 0;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e02bd2783124..33117d4ffce0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -447,7 +447,7 @@ void nfsd_reset_versions(void)
*/
static void set_max_drc(void)
{
- #define NFSD_DRC_SIZE_SHIFT 10
+ #define NFSD_DRC_SIZE_SHIFT 7
nfsd_drc_max_mem = (nr_free_buffer_pages()
>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
nfsd_drc_mem_used = 0;
@@ -517,7 +517,7 @@ int nfsd_create_serv(struct net *net)
register_inet6addr_notifier(&nfsd_inet6addr_notifier);
#endif
}
- do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
+ ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */
return 0;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 005c911b34ac..f3772ea8ba0d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -36,6 +36,7 @@
#define _NFSD4_STATE_H
#include <linux/idr.h>
+#include <linux/refcount.h>
#include <linux/sunrpc/svc_xprt.h>
#include "nfsfh.h"
@@ -83,7 +84,7 @@ struct nfsd4_callback_ops {
* fields that are of general use to any stateid.
*/
struct nfs4_stid {
- atomic_t sc_count;
+ refcount_t sc_count;
#define NFS4_OPEN_STID 1
#define NFS4_LOCK_STID 2
#define NFS4_DELEG_STID 4
@@ -169,11 +170,13 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
struct nfsd4_slot {
u32 sl_seqid;
__be32 sl_status;
+ struct svc_cred sl_cred;
u32 sl_datalen;
u16 sl_opcnt;
#define NFSD4_SLOT_INUSE (1 << 0)
#define NFSD4_SLOT_CACHETHIS (1 << 1)
#define NFSD4_SLOT_INITIALIZED (1 << 2)
+#define NFSD4_SLOT_CACHED (1 << 3)
u8 sl_flags;
char sl_data[];
};
@@ -465,7 +468,7 @@ struct nfs4_clnt_odstate {
struct nfs4_client *co_client;
struct nfs4_file *co_file;
struct list_head co_perfile;
- atomic_t co_odcount;
+ refcount_t co_odcount;
};
/*
@@ -481,7 +484,7 @@ struct nfs4_clnt_odstate {
* the global state_lock spinlock.
*/
struct nfs4_file {
- atomic_t fi_ref;
+ refcount_t fi_ref;
spinlock_t fi_lock;
struct hlist_node fi_hash; /* hash on fi_fhandle */
struct list_head fi_stateids;
@@ -634,7 +637,7 @@ struct nfs4_file *find_file(struct knfsd_fh *fh);
void put_nfs4_file(struct nfs4_file *fi);
static inline void get_nfs4_file(struct nfs4_file *fi)
{
- atomic_inc(&fi->fi_ref);
+ refcount_inc(&fi->fi_ref);
}
struct file *find_any_file(struct nfs4_file *f);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 1e4edbf70052..bc29511b6405 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -649,9 +649,18 @@ static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE;
}
-static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
+/*
+ * The session reply cache only needs to cache replies that the client
+ * actually asked us to. But it's almost free for us to cache compounds
+ * consisting of only a SEQUENCE op, so we may as well cache those too.
+ * Also, the protocol doesn't give us a convenient response in the case
+ * of a replay of a solo SEQUENCE op that wasn't cached
+ * (RETRY_UNCACHED_REP can only be returned in the second op of a
+ * compound).
+ */
+static inline bool nfsd4_cache_this(struct nfsd4_compoundres *resp)
{
- return !(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS)
+ return (resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS)
|| nfsd4_is_solo_sequence(resp);
}
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 515d13c196da..1a2894aa0194 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -150,7 +150,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
return err;
- inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ inode = nilfs_new_inode(dir, S_IFLNK | 0777);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index f65392fecb5c..f572538dcc4f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1954,8 +1954,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
err, ii->vfs_inode.i_ino);
return err;
}
- mark_buffer_dirty(ibh);
- nilfs_mdt_mark_dirty(ifile);
spin_lock(&nilfs->ns_inode_lock);
if (likely(!ii->i_bh))
ii->i_bh = ibh;
@@ -1964,6 +1962,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
goto retry;
}
+ // Always redirty the buffer to avoid race condition
+ mark_buffer_dirty(ii->i_bh);
+ nilfs_mdt_mark_dirty(ifile);
+
clear_bit(NILFS_I_QUEUED, &ii->i_state);
set_bit(NILFS_I_BUSY, &ii->i_state);
list_move_tail(&ii->i_dirty, &sci->sc_dirty_files);
@@ -2400,11 +2402,11 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
return err;
}
-static void nilfs_construction_timeout(unsigned long data)
+static void nilfs_construction_timeout(struct timer_list *t)
{
- struct task_struct *p = (struct task_struct *)data;
+ struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer);
- wake_up_process(p);
+ wake_up_process(sci->sc_timer_task);
}
static void
@@ -2542,8 +2544,7 @@ static int nilfs_segctor_thread(void *arg)
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int timeout = 0;
- sci->sc_timer.data = (unsigned long)current;
- sci->sc_timer.function = nilfs_construction_timeout;
+ sci->sc_timer_task = current;
/* start sync. */
sci->sc_task = current;
@@ -2674,7 +2675,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
INIT_LIST_HEAD(&sci->sc_gc_inodes);
INIT_LIST_HEAD(&sci->sc_iput_queue);
INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
- init_timer(&sci->sc_timer);
+ timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 1060949d7dd2..84084a4d9b3e 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -180,6 +180,7 @@ struct nilfs_sc_info {
unsigned long sc_watermark;
struct timer_list sc_timer;
+ struct task_struct *sc_timer_task;
struct task_struct *sc_task;
};
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 1541a1e9221a..1341a41e7b43 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -630,22 +630,22 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
}
/**
- * nilfs_sufile_truncate_range - truncate range of segment array
- * @sufile: inode of segment usage file
- * @start: start segment number (inclusive)
- * @end: end segment number (inclusive)
- *
- * Return Value: On success, 0 is returned. On error, one of the
- * following negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-EINVAL - Invalid number of segments specified
- *
- * %-EBUSY - Dirty or active segments are present in the range
- */
+ * nilfs_sufile_truncate_range - truncate range of segment array
+ * @sufile: inode of segment usage file
+ * @start: start segment number (inclusive)
+ * @end: end segment number (inclusive)
+ *
+ * Return Value: On success, 0 is returned. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid number of segments specified
+ *
+ * %-EBUSY - Dirty or active segments are present in the range
+ */
static int nilfs_sufile_truncate_range(struct inode *sufile,
__u64 start, __u64 end)
{
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 4fc018dfcfae..3ce20cd44a20 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -160,7 +160,6 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
ii->i_bh = NULL;
ii->i_state = 0;
ii->i_cno = 0;
- ii->vfs_inode.i_version = 1;
nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
return &ii->vfs_inode;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 2dd75bf619ad..afebb5067cec 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -737,7 +737,7 @@ struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno)
} else if (cno > root->cno) {
n = n->rb_right;
} else {
- atomic_inc(&root->count);
+ refcount_inc(&root->count);
spin_unlock(&nilfs->ns_cptree_lock);
return root;
}
@@ -776,7 +776,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
} else if (cno > root->cno) {
p = &(*p)->rb_right;
} else {
- atomic_inc(&root->count);
+ refcount_inc(&root->count);
spin_unlock(&nilfs->ns_cptree_lock);
kfree(new);
return root;
@@ -786,7 +786,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
new->cno = cno;
new->ifile = NULL;
new->nilfs = nilfs;
- atomic_set(&new->count, 1);
+ refcount_set(&new->count, 1);
atomic64_set(&new->inodes_count, 0);
atomic64_set(&new->blocks_count, 0);
@@ -806,7 +806,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
void nilfs_put_root(struct nilfs_root *root)
{
- if (atomic_dec_and_test(&root->count)) {
+ if (refcount_dec_and_test(&root->count)) {
struct the_nilfs *nilfs = root->nilfs;
nilfs_sysfs_delete_snapshot_group(root);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index b305c6f033e7..883d732b0259 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -27,6 +27,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
+#include <linux/refcount.h>
struct nilfs_sc_info;
struct nilfs_sysfs_dev_subgroups;
@@ -246,7 +247,7 @@ struct nilfs_root {
__u64 cno;
struct rb_node rb_node;
- atomic_t count;
+ refcount_t count;
struct the_nilfs *nilfs;
struct inode *ifile;
@@ -299,7 +300,7 @@ void nilfs_swap_super_block(struct the_nilfs *);
static inline void nilfs_get_root(struct nilfs_root *root)
{
- atomic_inc(&root->count);
+ refcount_inc(&root->count);
}
static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index c2d8233b1e82..480ea059a680 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -155,13 +155,11 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
int orangefs_init_acl(struct inode *inode, struct inode *dir)
{
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct posix_acl *default_acl, *acl;
umode_t mode = inode->i_mode;
+ struct iattr iattr;
int error = 0;
- ClearModeFlag(orangefs_inode);
-
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error)
return error;
@@ -180,9 +178,11 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir)
/* If mode of the inode was changed, then do a forcible ->setattr */
if (mode != inode->i_mode) {
- SetModeFlag(orangefs_inode);
+ memset(&iattr, 0, sizeof iattr);
inode->i_mode = mode;
- orangefs_flush_inode(inode);
+ iattr.ia_mode = mode;
+ iattr.ia_valid |= ATTR_MODE;
+ orangefs_inode_setattr(inode, &iattr);
}
return error;
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index a8cc588d6224..e2c2699d8016 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -386,7 +386,6 @@ static int orangefs_dir_release(struct inode *inode, struct file *file)
{
struct orangefs_dir *od = file->private_data;
struct orangefs_dir_part *part = od->part;
- orangefs_flush_inode(inode);
while (part) {
struct orangefs_dir_part *next = part->next;
vfree(part);
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index e4a8e6a7eb17..1668fd645c45 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -383,9 +383,15 @@ out:
if (type == ORANGEFS_IO_READ) {
file_accessed(file);
} else {
- SetMtimeFlag(orangefs_inode);
- inode->i_mtime = current_time(inode);
- mark_inode_dirty_sync(inode);
+ file_update_time(file);
+ /*
+ * Must invalidate to ensure write loop doesn't
+ * prevent kernel from reading updated
+ * attribute. Size probably changed because of
+ * the write, and other clients could update
+ * any other attribute.
+ */
+ orangefs_inode->getattr_time = jiffies - 1;
}
}
@@ -615,8 +621,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
"orangefs_file_release: called on %pD\n",
file);
- orangefs_flush_inode(inode);
-
/*
* remove all associated inode pages from the page cache and
* readahead cache (if any); this forces an expensive refresh of
@@ -666,8 +670,6 @@ static int orangefs_fsync(struct file *file,
ret);
op_release(new_op);
-
- orangefs_flush_inode(file_inode(file));
return ret;
}
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 28825a5b6d09..fe1d705ad91f 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -290,6 +290,22 @@ int orangefs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask);
}
+/*
+ * Installed as the ->update_time inode operation.  Calls
+ * generic_update_time() with the caller's arguments, then sends a
+ * setattr whose ia_valid carries one ATTR_* bit for each of
+ * S_ATIME/S_CTIME/S_MTIME present in @flags.
+ *
+ * Returns whatever orangefs_inode_setattr() returns.
+ */
+int orangefs_update_time(struct inode *inode, struct timespec *time, int flags)
+{
+	struct iattr attrs;
+	unsigned int valid = 0;
+
+	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n",
+	    get_khandle_from_ino(inode));
+	generic_update_time(inode, time, flags);
+
+	/* Map each requested S_* timestamp flag to its ATTR_* bit. */
+	valid |= (flags & S_ATIME) ? ATTR_ATIME : 0;
+	valid |= (flags & S_CTIME) ? ATTR_CTIME : 0;
+	valid |= (flags & S_MTIME) ? ATTR_MTIME : 0;
+
+	/* Every other iattr field is left zeroed. */
+	memset(&attrs, 0, sizeof(attrs));
+	attrs.ia_valid = valid;
+
+	return orangefs_inode_setattr(inode, &attrs);
+}
+
/* ORANGEDS2 implementation of VFS inode operations for files */
const struct inode_operations orangefs_file_inode_operations = {
.get_acl = orangefs_get_acl,
@@ -298,6 +314,7 @@ const struct inode_operations orangefs_file_inode_operations = {
.getattr = orangefs_getattr,
.listxattr = orangefs_listxattr,
.permission = orangefs_permission,
+ .update_time = orangefs_update_time,
};
static int orangefs_init_iops(struct inode *inode)
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 7e9e5d0ea3bc..c98bba2dbc94 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -22,7 +22,9 @@ static int orangefs_create(struct inode *dir,
{
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
+ struct orangefs_object_kref ref;
struct inode *inode;
+ struct iattr iattr;
int ret;
gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n",
@@ -55,8 +57,10 @@ static int orangefs_create(struct inode *dir,
if (ret < 0)
goto out;
- inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
- &new_op->downcall.resp.create.refn);
+ ref = new_op->downcall.resp.create.refn;
+ op_release(new_op);
+
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref);
if (IS_ERR(inode)) {
gossip_err("%s: Failed to allocate inode for file :%pd:\n",
__func__,
@@ -82,12 +86,13 @@ static int orangefs_create(struct inode *dir,
__func__,
dentry);
- SetMtimeFlag(parent);
dir->i_mtime = dir->i_ctime = current_time(dir);
+ memset(&iattr, 0, sizeof iattr);
+ iattr.ia_valid |= ATTR_MTIME;
+ orangefs_inode_setattr(dir, &iattr);
mark_inode_dirty_sync(dir);
ret = 0;
out:
- op_release(new_op);
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: %pd: returning %d\n",
__func__,
@@ -221,6 +226,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
+ struct iattr iattr;
int ret;
gossip_debug(GOSSIP_NAME_DEBUG,
@@ -253,8 +259,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
if (!ret) {
drop_nlink(inode);
- SetMtimeFlag(parent);
dir->i_mtime = dir->i_ctime = current_time(dir);
+ memset(&iattr, 0, sizeof iattr);
+ iattr.ia_valid |= ATTR_MTIME;
+ orangefs_inode_setattr(dir, &iattr);
mark_inode_dirty_sync(dir);
}
return ret;
@@ -266,7 +274,9 @@ static int orangefs_symlink(struct inode *dir,
{
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
+ struct orangefs_object_kref ref;
struct inode *inode;
+ struct iattr iattr;
int mode = 755;
int ret;
@@ -307,8 +317,10 @@ static int orangefs_symlink(struct inode *dir,
goto out;
}
- inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
- &new_op->downcall.resp.sym.refn);
+ ref = new_op->downcall.resp.sym.refn;
+ op_release(new_op);
+
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref);
if (IS_ERR(inode)) {
gossip_err
("*** Failed to allocate orangefs symlink inode\n");
@@ -331,12 +343,13 @@ static int orangefs_symlink(struct inode *dir,
get_khandle_from_ino(inode),
dentry);
- SetMtimeFlag(parent);
dir->i_mtime = dir->i_ctime = current_time(dir);
+ memset(&iattr, 0, sizeof iattr);
+ iattr.ia_valid |= ATTR_MTIME;
+ orangefs_inode_setattr(dir, &iattr);
mark_inode_dirty_sync(dir);
ret = 0;
out:
- op_release(new_op);
return ret;
}
@@ -344,7 +357,9 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
{
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
+ struct orangefs_object_kref ref;
struct inode *inode;
+ struct iattr iattr;
int ret;
new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
@@ -373,8 +388,10 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
goto out;
}
- inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
- &new_op->downcall.resp.mkdir.refn);
+ ref = new_op->downcall.resp.mkdir.refn;
+ op_release(new_op);
+
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref);
if (IS_ERR(inode)) {
gossip_err("*** Failed to allocate orangefs dir inode\n");
ret = PTR_ERR(inode);
@@ -400,11 +417,12 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
* NOTE: we have no good way to keep nlink consistent for directories
* across clients; keep constant at 1.
*/
- SetMtimeFlag(parent);
dir->i_mtime = dir->i_ctime = current_time(dir);
+ memset(&iattr, 0, sizeof iattr);
+ iattr.ia_valid |= ATTR_MTIME;
+ orangefs_inode_setattr(dir, &iattr);
mark_inode_dirty_sync(dir);
out:
- op_release(new_op);
return ret;
}
@@ -470,4 +488,5 @@ const struct inode_operations orangefs_dir_inode_operations = {
.getattr = orangefs_getattr,
.listxattr = orangefs_listxattr,
.permission = orangefs_permission,
+ .update_time = orangefs_update_time,
};
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
index b6001bb28f5a..c7db56a31b92 100644
--- a/fs/orangefs/orangefs-debug.h
+++ b/fs/orangefs/orangefs-debug.h
@@ -15,8 +15,10 @@
#ifdef __KERNEL__
#include <linux/types.h>
+#include <linux/kernel.h>
#else
#include <stdint.h>
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif
#define GOSSIP_NO_DEBUG (__u64)0
@@ -88,6 +90,6 @@ static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
};
static const int num_kmod_keyword_mask_map = (int)
- (sizeof(s_kmod_keyword_mask_map) / sizeof(struct __keyword_mask_s));
+ (ARRAY_SIZE(s_kmod_keyword_mask_map));
#endif /* __ORANGEFS_DEBUG_H */
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 004af348fb80..97adf7d100b5 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -209,37 +209,10 @@ struct orangefs_inode_s {
struct inode vfs_inode;
sector_t last_failed_block_index_read;
- /*
- * State of in-memory attributes not yet flushed to disk associated
- * with this object
- */
- unsigned long pinode_flags;
-
unsigned long getattr_time;
u32 getattr_mask;
};
-#define P_ATIME_FLAG 0
-#define P_MTIME_FLAG 1
-#define P_CTIME_FLAG 2
-#define P_MODE_FLAG 3
-
-#define ClearAtimeFlag(pinode) clear_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
-#define SetAtimeFlag(pinode) set_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
-#define AtimeFlag(pinode) test_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
-
-#define ClearMtimeFlag(pinode) clear_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
-#define SetMtimeFlag(pinode) set_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
-#define MtimeFlag(pinode) test_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
-
-#define ClearCtimeFlag(pinode) clear_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
-#define SetCtimeFlag(pinode) set_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
-#define CtimeFlag(pinode) test_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
-
-#define ClearModeFlag(pinode) clear_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
-#define SetModeFlag(pinode) set_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
-#define ModeFlag(pinode) test_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
-
/* per superblock private orangefs info */
struct orangefs_sb_info_s {
struct orangefs_khandle root_khandle;
@@ -275,12 +248,6 @@ struct orangefs_kiocb_s {
/* orangefs kernel operation type */
struct orangefs_kernel_op_s *op;
- /* The user space buffers from/to which I/O is being staged */
- struct iovec *iov;
-
- /* number of elements in the iovector */
- unsigned long nr_segs;
-
/* set to indicate the type of the operation */
int rw;
@@ -442,6 +409,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
int orangefs_permission(struct inode *inode, int mask);
+int orangefs_update_time(struct inode *, struct timespec *, int);
+
/*
* defined in xattr.c
*/
@@ -484,8 +453,6 @@ bool __is_daemon_in_service(void);
*/
__s32 fsid_of_op(struct orangefs_kernel_op_s *op);
-int orangefs_flush_inode(struct inode *inode);
-
ssize_t orangefs_inode_getxattr(struct inode *inode,
const char *name,
void *buffer,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index f82336496311..97fe93129f38 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -4,6 +4,7 @@
*
* See COPYING in top-level directory.
*/
+#include <linux/kernel.h>
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-dev-proto.h"
@@ -437,89 +438,8 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
op_release(new_op);
- /*
- * successful setattr should clear the atime, mtime and
- * ctime flags.
- */
- if (ret == 0) {
- ClearAtimeFlag(orangefs_inode);
- ClearMtimeFlag(orangefs_inode);
- ClearCtimeFlag(orangefs_inode);
- ClearModeFlag(orangefs_inode);
+ if (ret == 0)
orangefs_inode->getattr_time = jiffies - 1;
- }
-
- return ret;
-}
-
-int orangefs_flush_inode(struct inode *inode)
-{
- /*
- * If it is a dirty inode, this function gets called.
- * Gather all the information that needs to be setattr'ed
- * Right now, this will only be used for mode, atime, mtime
- * and/or ctime.
- */
- struct iattr wbattr;
- int ret;
- int mtime_flag;
- int ctime_flag;
- int atime_flag;
- int mode_flag;
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
-
- memset(&wbattr, 0, sizeof(wbattr));
-
- /*
- * check inode flags up front, and clear them if they are set. This
- * will prevent multiple processes from all trying to flush the same
- * inode if they call close() simultaneously
- */
- mtime_flag = MtimeFlag(orangefs_inode);
- ClearMtimeFlag(orangefs_inode);
- ctime_flag = CtimeFlag(orangefs_inode);
- ClearCtimeFlag(orangefs_inode);
- atime_flag = AtimeFlag(orangefs_inode);
- ClearAtimeFlag(orangefs_inode);
- mode_flag = ModeFlag(orangefs_inode);
- ClearModeFlag(orangefs_inode);
-
- /* -- Lazy atime,mtime and ctime update --
- * Note: all times are dictated by server in the new scheme
- * and not by the clients
- *
- * Also mode updates are being handled now..
- */
-
- if (mtime_flag)
- wbattr.ia_valid |= ATTR_MTIME;
- if (ctime_flag)
- wbattr.ia_valid |= ATTR_CTIME;
- if (atime_flag)
- wbattr.ia_valid |= ATTR_ATIME;
-
- if (mode_flag) {
- wbattr.ia_mode = inode->i_mode;
- wbattr.ia_valid |= ATTR_MODE;
- }
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "*********** orangefs_flush_inode: %pU "
- "(ia_valid %d)\n",
- get_khandle_from_ino(inode),
- wbattr.ia_valid);
- if (wbattr.ia_valid == 0) {
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_flush_inode skipping setattr()\n");
- return 0;
- }
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_flush_inode (%pU) writing mode %o\n",
- get_khandle_from_ino(inode),
- inode->i_mode);
-
- ret = orangefs_inode_setattr(inode, &wbattr);
return ret;
}
@@ -606,7 +526,7 @@ int orangefs_normalize_to_errno(__s32 error_code)
/* Convert ORANGEFS encoded errno values into regular errno values. */
} else if ((-error_code) & ORANGEFS_ERROR_BIT) {
i = (-error_code) & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS);
- if (i < sizeof(PINT_errno_mapping)/sizeof(*PINT_errno_mapping))
+ if (i < ARRAY_SIZE(PINT_errno_mapping))
error_code = -PINT_errno_mapping[i];
else
error_code = -EINVAL;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 47ebd9bfd1a1..366750eef201 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -99,8 +99,6 @@ static void orangefs_inode_cache_ctor(void *req)
inode_init_once(&orangefs_inode->vfs_inode);
init_rwsem(&orangefs_inode->xattr_sem);
-
- orangefs_inode->vfs_inode.i_version = 1;
}
static struct inode *orangefs_alloc_inode(struct super_block *sb)
@@ -119,7 +117,6 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb)
orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL;
orangefs_inode->last_failed_block_index_read = 0;
memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target));
- orangefs_inode->pinode_flags = 0;
gossip_debug(GOSSIP_SUPER_DEBUG,
"orangefs_alloc_inode: allocated %p\n",
@@ -299,21 +296,9 @@ void fsid_key_table_finalize(void)
{
}
-/* Called whenever the VFS dirties the inode in response to atime updates */
-static void orangefs_dirty_inode(struct inode *inode, int flags)
-{
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
-
- gossip_debug(GOSSIP_SUPER_DEBUG,
- "orangefs_dirty_inode: %pU\n",
- get_khandle_from_ino(inode));
- SetAtimeFlag(orangefs_inode);
-}
-
static const struct super_operations orangefs_s_ops = {
.alloc_inode = orangefs_alloc_inode,
.destroy_inode = orangefs_destroy_inode,
- .dirty_inode = orangefs_dirty_inode,
.drop_inode = generic_delete_inode,
.statfs = orangefs_statfs,
.remount_fs = orangefs_remount_fs,
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
index d856cdf91763..db107fe91ab3 100644
--- a/fs/orangefs/symlink.c
+++ b/fs/orangefs/symlink.c
@@ -15,4 +15,5 @@ const struct inode_operations orangefs_symlink_inode_operations = {
.getattr = orangefs_getattr,
.listxattr = orangefs_listxattr,
.permission = orangefs_permission,
+ .update_time = orangefs_update_time,
};
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index c441f9387a1b..eb3b8d39fb61 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -22,7 +22,6 @@
#include <linux/ratelimit.h>
#include <linux/exportfs.h>
#include "overlayfs.h"
-#include "ovl_entry.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
@@ -486,6 +485,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
{
struct inode *udir = c->destdir->d_inode;
+ struct inode *inode;
struct dentry *newdentry = NULL;
struct dentry *temp = NULL;
int err;
@@ -508,7 +508,11 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
if (err)
goto out_cleanup;
- ovl_inode_update(d_inode(c->dentry), newdentry);
+ inode = d_inode(c->dentry);
+ ovl_inode_update(inode, newdentry);
+ if (S_ISDIR(inode->i_mode))
+ ovl_set_flag(OVL_WHITEOUTS, inode);
+
out:
dput(temp);
return err;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index cc961a3bd3bd..e13921824c70 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -181,6 +181,11 @@ static bool ovl_type_origin(struct dentry *dentry)
return OVL_TYPE_ORIGIN(ovl_path_type(dentry));
}
+static bool ovl_may_have_whiteouts(struct dentry *dentry)
+{
+ return ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry));
+}
+
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct cattr *attr, struct dentry *hardlink)
{
@@ -300,7 +305,6 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
- enum ovl_path_type type = ovl_path_type(dentry);
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
@@ -313,13 +317,13 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
* When removing an empty opaque directory, then it makes no sense to
* replace it with an exact replica of itself.
*
- * If no upperdentry then skip clearing whiteouts.
+ * If upperdentry has whiteouts, clear them.
*
* Can race with copy-up, since we don't hold the upperdir mutex.
* Doesn't matter, since copy-up can't create a non-empty directory
* from an empty one.
*/
- if (OVL_TYPE_UPPER(type) && OVL_TYPE_MERGE(type))
+ if (!list_empty(&list))
ret = ovl_clear_empty(dentry, &list);
out_free:
@@ -698,8 +702,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
struct dentry *opaquedir = NULL;
int err;
- /* Redirect dir can be !ovl_lower_positive && OVL_TYPE_MERGE */
- if (is_dir && ovl_dentry_get_redirect(dentry)) {
+ /* Redirect/origin dir can be !ovl_lower_positive && not clean */
+ if (is_dir && (ovl_dentry_get_redirect(dentry) ||
+ ovl_may_have_whiteouts(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
@@ -946,7 +951,8 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
old_cred = ovl_override_creds(old->d_sb);
- if (overwrite && new_is_dir && ovl_type_merge_or_lower(new)) {
+ if (overwrite && new_is_dir && (ovl_type_merge_or_lower(new) ||
+ ovl_may_have_whiteouts(new))) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
@@ -1069,9 +1075,10 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
drop_nlink(d_inode(new));
}
- ovl_dentry_version_inc(old->d_parent,
- !overwrite && ovl_type_origin(new));
- ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old));
+ ovl_dentry_version_inc(old->d_parent, ovl_type_origin(old) ||
+ (!overwrite && ovl_type_origin(new)));
+ ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old) ||
+ (d_inode(new) && ovl_type_origin(new)));
out_dput:
dput(newdentry);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 321511ed8c42..00b6b294272a 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -15,6 +15,14 @@
#include <linux/ratelimit.h>
#include "overlayfs.h"
+
+static dev_t ovl_get_pseudo_dev(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->lowerstack[0].layer->pseudo_dev;
+}
+
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
@@ -66,6 +74,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
struct path realpath;
const struct cred *old_cred;
bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
+ bool samefs = ovl_same_sb(dentry->d_sb);
int err;
type = ovl_path_real(dentry, &realpath);
@@ -75,16 +84,13 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
goto out;
/*
- * When all layers are on the same fs, all real inode number are
- * unique, so we use the overlay st_dev, which is friendly to du -x.
- *
- * We also use st_ino of the copy up origin, if we know it.
- * This guaranties constant st_dev/st_ino across copy up.
+ * For non-dir or same fs, we use st_ino of the copy up origin, if we
+ * know it. This guaranties constant st_dev/st_ino across copy up.
*
* If filesystem supports NFS export ops, this also guaranties
* persistent st_ino across mount cycle.
*/
- if (ovl_same_sb(dentry->d_sb)) {
+ if (!is_dir || samefs) {
if (OVL_TYPE_ORIGIN(type)) {
struct kstat lowerstat;
u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
@@ -95,7 +101,6 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
if (err)
goto out;
- WARN_ON_ONCE(stat->dev != lowerstat.dev);
/*
* Lower hardlinks may be broken on copy up to different
* upper files, so we cannot use the lower origin st_ino
@@ -107,17 +112,36 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
if (is_dir || lowerstat.nlink == 1 ||
ovl_test_flag(OVL_INDEX, d_inode(dentry)))
stat->ino = lowerstat.ino;
+
+ if (samefs)
+ WARN_ON_ONCE(stat->dev != lowerstat.dev);
+ else
+ stat->dev = ovl_get_pseudo_dev(dentry);
}
- stat->dev = dentry->d_sb->s_dev;
- } else if (is_dir) {
+ if (samefs) {
+ /*
+ * When all layers are on the same fs, all real inode
+ * number are unique, so we use the overlay st_dev,
+ * which is friendly to du -x.
+ */
+ stat->dev = dentry->d_sb->s_dev;
+ } else if (!OVL_TYPE_UPPER(type)) {
+ /*
+ * For non-samefs setup, to make sure that st_dev/st_ino
+ * pair is unique across the system, we use a unique
+ * anonymous st_dev for lower layer inode.
+ */
+ stat->dev = ovl_get_pseudo_dev(dentry);
+ }
+ } else {
/*
- * If not all layers are on the same fs the pair {real st_ino;
- * overlay st_dev} is not unique, so use the non persistent
- * overlay st_ino.
- *
* Always use the overlay st_dev for directories, so 'find
* -xdev' will scan the entire overlay mount and won't cross the
* overlay mount boundaries.
+ *
+ * If not all layers are on the same fs the pair {real st_ino;
+ * overlay st_dev} is not unique, so use the non persistent
+ * overlay st_ino for directories.
*/
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
@@ -409,6 +433,7 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
#ifdef CONFIG_LOCKDEP
static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
+ static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];
int depth = inode->i_sb->s_stack_depth - 1;
@@ -419,6 +444,8 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
else
lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
+
+ lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
#endif
}
@@ -657,6 +684,16 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry,
if (upperdentry && ovl_is_impuredir(upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
+ /* Check for non-merge dir that may have whiteouts */
+ if (S_ISDIR(realinode->i_mode)) {
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (((upperdentry && lowerdentry) || oe->numlower > 1) ||
+ ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
+ ovl_set_flag(OVL_WHITEOUTS, inode);
+ }
+ }
+
if (inode->i_state & I_NEW)
unlock_new_inode(inode);
out:
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index a12dc10bf726..625ed8066570 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -15,7 +15,6 @@
#include <linux/mount.h>
#include <linux/exportfs.h>
#include "overlayfs.h"
-#include "ovl_entry.h"
struct ovl_lookup_data {
struct qstr name;
@@ -286,16 +285,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
static int ovl_check_origin(struct dentry *upperdentry,
- struct path *lowerstack, unsigned int numlower,
- struct path **stackp, unsigned int *ctrp)
+ struct ovl_path *lower, unsigned int numlower,
+ struct ovl_path **stackp, unsigned int *ctrp)
{
struct vfsmount *mnt;
struct dentry *origin = NULL;
int i;
-
for (i = 0; i < numlower; i++) {
- mnt = lowerstack[i].mnt;
+ mnt = lower[i].layer->mnt;
origin = ovl_get_origin(upperdentry, mnt);
if (IS_ERR(origin))
return PTR_ERR(origin);
@@ -309,12 +307,12 @@ static int ovl_check_origin(struct dentry *upperdentry,
BUG_ON(*ctrp);
if (!*stackp)
- *stackp = kmalloc(sizeof(struct path), GFP_KERNEL);
+ *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL);
if (!*stackp) {
dput(origin);
return -ENOMEM;
}
- **stackp = (struct path) { .dentry = origin, .mnt = mnt };
+ **stackp = (struct ovl_path){.dentry = origin, .layer = lower[i].layer};
*ctrp = 1;
return 0;
@@ -350,8 +348,8 @@ static int ovl_verify_origin_fh(struct dentry *dentry, const struct ovl_fh *fh)
*
* Return 0 on match, -ESTALE on mismatch, < 0 on error.
*/
-int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt,
- struct dentry *origin, bool is_upper, bool set)
+int ovl_verify_origin(struct dentry *dentry, struct dentry *origin,
+ bool is_upper, bool set)
{
struct inode *inode;
struct ovl_fh *fh;
@@ -384,13 +382,13 @@ fail:
* OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path.
* Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error.
*/
-int ovl_verify_index(struct dentry *index, struct path *lowerstack,
+int ovl_verify_index(struct dentry *index, struct ovl_path *lower,
unsigned int numlower)
{
struct ovl_fh *fh = NULL;
size_t len;
- struct path origin = { };
- struct path *stack = &origin;
+ struct ovl_path origin = { };
+ struct ovl_path *stack = &origin;
unsigned int ctr = 0;
int err;
@@ -429,7 +427,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack,
if (err)
goto fail;
- err = ovl_check_origin(index, lowerstack, numlower, &stack, &ctr);
+ err = ovl_check_origin(index, lower, numlower, &stack, &ctr);
if (!err && !ctr)
err = -ESTALE;
if (err)
@@ -568,11 +566,24 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
idx++;
}
BUG_ON(idx > oe->numlower);
- *path = oe->lowerstack[idx - 1];
+ path->dentry = oe->lowerstack[idx - 1].dentry;
+ path->mnt = oe->lowerstack[idx - 1].layer->mnt;
return (idx < oe->numlower) ? idx + 1 : -1;
}
+static int ovl_find_layer(struct ovl_fs *ofs, struct ovl_path *path)
+{
+ int i;
+
+ for (i = 0; i < ofs->numlower; i++) {
+ if (ofs->lower_layers[i].mnt == path->layer->mnt)
+ break;
+ }
+
+ return i;
+}
+
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
@@ -581,7 +592,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
struct ovl_entry *poe = dentry->d_parent->d_fsdata;
struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
- struct path *stack = NULL;
+ struct ovl_path *stack = NULL;
struct dentry *upperdir, *upperdentry = NULL;
struct dentry *index = NULL;
unsigned int ctr = 0;
@@ -630,7 +641,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
err = ovl_check_origin(upperdentry, roe->lowerstack,
roe->numlower, &stack, &ctr);
if (err)
- goto out;
+ goto out_put_upper;
}
if (d.redirect) {
@@ -646,17 +657,17 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (!d.stop && poe->numlower) {
err = -ENOMEM;
- stack = kcalloc(ofs->numlower, sizeof(struct path),
+ stack = kcalloc(ofs->numlower, sizeof(struct ovl_path),
GFP_KERNEL);
if (!stack)
goto out_put_upper;
}
for (i = 0; !d.stop && i < poe->numlower; i++) {
- struct path lowerpath = poe->lowerstack[i];
+ struct ovl_path lower = poe->lowerstack[i];
d.last = i == poe->numlower - 1;
- err = ovl_lookup_layer(lowerpath.dentry, &d, &this);
+ err = ovl_lookup_layer(lower.dentry, &d, &this);
if (err)
goto out_put;
@@ -664,7 +675,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
continue;
stack[ctr].dentry = this;
- stack[ctr].mnt = lowerpath.mnt;
+ stack[ctr].layer = lower.layer;
ctr++;
if (d.stop)
@@ -674,10 +685,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
poe = roe;
/* Find the current layer on the root dentry */
- for (i = 0; i < poe->numlower; i++)
- if (poe->lowerstack[i].mnt == lowerpath.mnt)
- break;
- if (WARN_ON(i == poe->numlower))
+ i = ovl_find_layer(ofs, &lower);
+ if (WARN_ON(i == ofs->numlower))
break;
}
}
@@ -700,7 +709,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out_put;
oe->opaque = upperopaque;
- memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
+ memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr);
dentry->d_fsdata = oe;
if (upperdentry)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index d9a0edd4e57e..13eab09a6b6f 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/uuid.h>
+#include "ovl_entry.h"
enum ovl_path_type {
__OVL_PATH_UPPER = (1 << 0),
@@ -28,7 +29,10 @@ enum ovl_path_type {
#define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink"
enum ovl_flag {
+ /* Pure upper dir that may contain non pure upper entries */
OVL_IMPURE,
+ /* Non-merge dir that may contain whiteout entries */
+ OVL_WHITEOUTS,
OVL_INDEX,
};
@@ -223,6 +227,7 @@ bool ovl_is_whiteout(struct dentry *dentry);
struct file *ovl_path_open(struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry);
void ovl_copy_up_end(struct dentry *dentry);
+bool ovl_check_origin_xattr(struct dentry *dentry);
bool ovl_check_dir_xattr(struct dentry *dentry, const char *name);
int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
const char *name, const void *value, size_t size,
@@ -244,9 +249,9 @@ static inline bool ovl_is_impuredir(struct dentry *dentry)
/* namei.c */
-int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt,
- struct dentry *origin, bool is_upper, bool set);
-int ovl_verify_index(struct dentry *index, struct path *lowerstack,
+int ovl_verify_origin(struct dentry *dentry, struct dentry *origin,
+ bool is_upper, bool set);
+int ovl_verify_index(struct dentry *index, struct ovl_path *lower,
unsigned int numlower);
int ovl_get_index_name(struct dentry *origin, struct qstr *name);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
@@ -263,7 +268,7 @@ int ovl_check_d_type_supported(struct path *realpath);
void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
- struct path *lowerstack, unsigned int numlower);
+ struct ovl_path *lower, unsigned int numlower);
/* inode.c */
int ovl_set_nlink_upper(struct dentry *dentry);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 36b49bd09264..752bab645879 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -17,11 +17,21 @@ struct ovl_config {
bool index;
};
+struct ovl_layer {
+ struct vfsmount *mnt;
+ dev_t pseudo_dev;
+};
+
+struct ovl_path {
+ struct ovl_layer *layer;
+ struct dentry *dentry;
+};
+
/* private information held for overlayfs's superblock */
struct ovl_fs {
struct vfsmount *upper_mnt;
unsigned numlower;
- struct vfsmount **lower_mnt;
+ struct ovl_layer *lower_layers;
/* workbasedir is the path at workdir= mount option */
struct dentry *workbasedir;
/* workdir is the 'work' directory under workbasedir */
@@ -52,7 +62,7 @@ struct ovl_entry {
struct rcu_head rcu;
};
unsigned numlower;
- struct path lowerstack[];
+ struct ovl_path lowerstack[];
};
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index c310e3ff7f3f..0daa4354fec4 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -26,6 +26,7 @@ struct ovl_cache_entry {
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
+ bool is_upper;
bool is_whiteout;
char name[];
};
@@ -158,6 +159,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
/* Defer setting d_ino for upper entry to ovl_iterate() */
if (ovl_calc_d_ino(rdd, p))
p->ino = 0;
+ p->is_upper = rdd->is_upper;
p->is_whiteout = false;
if (d_type == DT_CHR) {
@@ -316,21 +318,37 @@ static inline int ovl_dir_read(struct path *realpath,
return err;
}
+/*
+ * Can we iterate real dir directly?
+ *
+ * Non-merge dir may contain whiteouts from a time it was a merge upper, before
+ * lower dir was removed under it and possibly before it was rotated from upper
+ * to lower layer.
+ */
+static bool ovl_dir_is_real(struct dentry *dir)
+{
+ return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir));
+}
+
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
- enum ovl_path_type type = ovl_path_type(dentry);
+ bool is_real;
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
- WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
- if (od->is_real && OVL_TYPE_MERGE(type))
+ is_real = ovl_dir_is_real(dentry);
+ if (od->is_real != is_real) {
+ /* is_real can only become false when dir is copied up */
+ if (WARN_ON(is_real))
+ return;
od->is_real = false;
+ }
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
@@ -816,7 +834,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
return PTR_ERR(realfile);
}
od->realfile = realfile;
- od->is_real = !OVL_TYPE_MERGE(type);
+ od->is_real = ovl_dir_is_real(file->f_path.dentry);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
@@ -835,7 +853,7 @@ const struct file_operations ovl_dir_operations = {
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
- struct ovl_cache_entry *p;
+ struct ovl_cache_entry *p, *n;
struct rb_root root = RB_ROOT;
err = ovl_dir_read_merged(dentry, list, &root);
@@ -844,18 +862,29 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
err = 0;
- list_for_each_entry(p, list, l_node) {
- if (p->is_whiteout)
- continue;
+ list_for_each_entry_safe(p, n, list, l_node) {
+ /*
+ * Select whiteouts in upperdir, they should
+ * be cleared when deleting this directory.
+ */
+ if (p->is_whiteout) {
+ if (p->is_upper)
+ continue;
+ goto del_entry;
+ }
if (p->name[0] == '.') {
if (p->len == 1)
- continue;
+ goto del_entry;
if (p->len == 2 && p->name[1] == '.')
- continue;
+ goto del_entry;
}
err = -ENOTEMPTY;
break;
+
+del_entry:
+ list_del(&p->l_node);
+ kfree(p);
}
return err;
@@ -869,7 +898,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
- if (!p->is_whiteout)
+ if (WARN_ON(!p->is_whiteout || !p->is_upper))
continue;
dentry = lookup_one_len(p->name, upper, p->len);
@@ -985,7 +1014,7 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
}
int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
- struct path *lowerstack, unsigned int numlower)
+ struct ovl_path *lower, unsigned int numlower)
{
int err;
struct dentry *index = NULL;
@@ -1020,7 +1049,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
index = NULL;
break;
}
- err = ovl_verify_index(index, lowerstack, numlower);
+ err = ovl_verify_index(index, lower, numlower);
/* Cleanup stale and orphan index entries */
if (err && (err == -ESTALE || err == -ENOENT))
err = ovl_cleanup(dir, index);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index f5738e96a052..be03578181d2 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -18,7 +18,6 @@
#include <linux/seq_file.h>
#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"
-#include "ovl_entry.h"
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Overlay filesystem");
@@ -39,15 +38,20 @@ module_param_named(index, ovl_index_def, bool, 0644);
MODULE_PARM_DESC(ovl_index_def,
"Default to on or off for the inodes index feature");
+static void ovl_entry_stack_free(struct ovl_entry *oe)
+{
+ unsigned int i;
+
+ for (i = 0; i < oe->numlower; i++)
+ dput(oe->lowerstack[i].dentry);
+}
+
static void ovl_dentry_release(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
if (oe) {
- unsigned int i;
-
- for (i = 0; i < oe->numlower; i++)
- dput(oe->lowerstack[i].dentry);
+ ovl_entry_stack_free(oe);
kfree_rcu(oe, rcu);
}
}
@@ -207,39 +211,48 @@ static void ovl_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, ovl_i_callback);
}
-static void ovl_put_super(struct super_block *sb)
+static void ovl_free_fs(struct ovl_fs *ofs)
{
- struct ovl_fs *ufs = sb->s_fs_info;
unsigned i;
- dput(ufs->indexdir);
- dput(ufs->workdir);
- if (ufs->workdir_locked)
- ovl_inuse_unlock(ufs->workbasedir);
- dput(ufs->workbasedir);
- if (ufs->upper_mnt && ufs->upperdir_locked)
- ovl_inuse_unlock(ufs->upper_mnt->mnt_root);
- mntput(ufs->upper_mnt);
- for (i = 0; i < ufs->numlower; i++)
- mntput(ufs->lower_mnt[i]);
- kfree(ufs->lower_mnt);
-
- kfree(ufs->config.lowerdir);
- kfree(ufs->config.upperdir);
- kfree(ufs->config.workdir);
- put_cred(ufs->creator_cred);
- kfree(ufs);
+ dput(ofs->indexdir);
+ dput(ofs->workdir);
+ if (ofs->workdir_locked)
+ ovl_inuse_unlock(ofs->workbasedir);
+ dput(ofs->workbasedir);
+ if (ofs->upperdir_locked)
+ ovl_inuse_unlock(ofs->upper_mnt->mnt_root);
+ mntput(ofs->upper_mnt);
+ for (i = 0; i < ofs->numlower; i++) {
+ mntput(ofs->lower_layers[i].mnt);
+ free_anon_bdev(ofs->lower_layers[i].pseudo_dev);
+ }
+ kfree(ofs->lower_layers);
+
+ kfree(ofs->config.lowerdir);
+ kfree(ofs->config.upperdir);
+ kfree(ofs->config.workdir);
+ if (ofs->creator_cred)
+ put_cred(ofs->creator_cred);
+ kfree(ofs);
+}
+
+static void ovl_put_super(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ ovl_free_fs(ofs);
}
static int ovl_sync_fs(struct super_block *sb, int wait)
{
- struct ovl_fs *ufs = sb->s_fs_info;
+ struct ovl_fs *ofs = sb->s_fs_info;
struct super_block *upper_sb;
int ret;
- if (!ufs->upper_mnt)
+ if (!ofs->upper_mnt)
return 0;
- upper_sb = ufs->upper_mnt->mnt_sb;
+ upper_sb = ofs->upper_mnt->mnt_sb;
if (!upper_sb->s_op->sync_fs)
return 0;
@@ -277,9 +290,9 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
}
/* Will this overlay be forced to mount/remount ro? */
-static bool ovl_force_readonly(struct ovl_fs *ufs)
+static bool ovl_force_readonly(struct ovl_fs *ofs)
{
- return (!ufs->upper_mnt || !ufs->workdir);
+ return (!ofs->upper_mnt || !ofs->workdir);
}
/**
@@ -291,29 +304,29 @@ static bool ovl_force_readonly(struct ovl_fs *ufs)
static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
{
struct super_block *sb = dentry->d_sb;
- struct ovl_fs *ufs = sb->s_fs_info;
+ struct ovl_fs *ofs = sb->s_fs_info;
- seq_show_option(m, "lowerdir", ufs->config.lowerdir);
- if (ufs->config.upperdir) {
- seq_show_option(m, "upperdir", ufs->config.upperdir);
- seq_show_option(m, "workdir", ufs->config.workdir);
+ seq_show_option(m, "lowerdir", ofs->config.lowerdir);
+ if (ofs->config.upperdir) {
+ seq_show_option(m, "upperdir", ofs->config.upperdir);
+ seq_show_option(m, "workdir", ofs->config.workdir);
}
- if (ufs->config.default_permissions)
+ if (ofs->config.default_permissions)
seq_puts(m, ",default_permissions");
- if (ufs->config.redirect_dir != ovl_redirect_dir_def)
+ if (ofs->config.redirect_dir != ovl_redirect_dir_def)
seq_printf(m, ",redirect_dir=%s",
- ufs->config.redirect_dir ? "on" : "off");
- if (ufs->config.index != ovl_index_def)
+ ofs->config.redirect_dir ? "on" : "off");
+ if (ofs->config.index != ovl_index_def)
seq_printf(m, ",index=%s",
- ufs->config.index ? "on" : "off");
+ ofs->config.index ? "on" : "off");
return 0;
}
static int ovl_remount(struct super_block *sb, int *flags, char *data)
{
- struct ovl_fs *ufs = sb->s_fs_info;
+ struct ovl_fs *ofs = sb->s_fs_info;
- if (!(*flags & MS_RDONLY) && ovl_force_readonly(ufs))
+ if (!(*flags & MS_RDONLY) && ovl_force_readonly(ofs))
return -EROFS;
return 0;
@@ -451,13 +464,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
#define OVL_WORKDIR_NAME "work"
#define OVL_INDEXDIR_NAME "index"
-static struct dentry *ovl_workdir_create(struct super_block *sb,
- struct ovl_fs *ufs,
- struct dentry *dentry,
+static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
const char *name, bool persist)
{
- struct inode *dir = dentry->d_inode;
- struct vfsmount *mnt = ufs->upper_mnt;
+ struct inode *dir = ofs->workbasedir->d_inode;
+ struct vfsmount *mnt = ofs->upper_mnt;
struct dentry *work;
int err;
bool retried = false;
@@ -471,7 +482,7 @@ static struct dentry *ovl_workdir_create(struct super_block *sb,
locked = true;
retry:
- work = lookup_one_len(name, dentry, strlen(name));
+ work = lookup_one_len(name, ofs->workbasedir, strlen(name));
if (!IS_ERR(work)) {
struct iattr attr = {
@@ -541,8 +552,7 @@ out_dput:
dput(work);
out_err:
pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n",
- ufs->config.workdir, name, -err);
- sb->s_flags |= MS_RDONLY;
+ ofs->config.workdir, name, -err);
work = NULL;
goto out_unlock;
}
@@ -585,7 +595,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
return 0;
out_put:
- path_put(path);
+ path_put_init(path);
out:
return err;
}
@@ -603,7 +613,7 @@ static int ovl_mount_dir(const char *name, struct path *path)
if (ovl_dentry_remote(path->dentry)) {
pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n",
tmp);
- path_put(path);
+ path_put_init(path);
err = -EINVAL;
}
kfree(tmp);
@@ -655,7 +665,7 @@ static int ovl_lower_dir(const char *name, struct path *path,
return 0;
out_put:
- path_put(path);
+ path_put_init(path);
out:
return err;
}
@@ -826,129 +836,269 @@ static const struct xattr_handler *ovl_xattr_handlers[] = {
NULL
};
-static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
{
- struct path upperpath = { };
- struct path workpath = { };
- struct dentry *root_dentry;
- struct ovl_entry *oe;
- struct ovl_fs *ufs;
- struct path *stack = NULL;
- char *lowertmp;
- char *lower;
- unsigned int numlower;
- unsigned int stacklen = 0;
- unsigned int i;
- bool remote = false;
- struct cred *cred;
+ struct vfsmount *upper_mnt;
int err;
- err = -ENOMEM;
- ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
- if (!ufs)
+ err = ovl_mount_dir(ofs->config.upperdir, upperpath);
+ if (err)
goto out;
- ufs->config.redirect_dir = ovl_redirect_dir_def;
- ufs->config.index = ovl_index_def;
- err = ovl_parse_opt((char *) data, &ufs->config);
+ /* Upper fs should not be r/o */
+ if (sb_rdonly(upperpath->mnt->mnt_sb)) {
+ pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir);
if (err)
- goto out_free_config;
+ goto out;
+
+ err = -EBUSY;
+ if (ovl_inuse_trylock(upperpath->dentry)) {
+ ofs->upperdir_locked = true;
+ } else if (ofs->config.index) {
+ pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
+ goto out;
+ } else {
+ pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+ }
+
+ upper_mnt = clone_private_mount(upperpath);
+ err = PTR_ERR(upper_mnt);
+ if (IS_ERR(upper_mnt)) {
+ pr_err("overlayfs: failed to clone upperpath\n");
+ goto out;
+ }
+
+ /* Don't inherit atime flags */
+ upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
+ ofs->upper_mnt = upper_mnt;
+ err = 0;
+out:
+ return err;
+}
+
+static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath)
+{
+ struct dentry *temp;
+ int err;
+
+ ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false);
+ if (!ofs->workdir)
+ return 0;
+
+ /*
+ * Upper should support d_type, else whiteouts are visible. Given
+ * workdir and upper are on same fs, we can do iterate_dir() on
+ * workdir. This check requires successful creation of workdir in
+ * previous step.
+ */
+ err = ovl_check_d_type_supported(workpath);
+ if (err < 0)
+ return err;
+
+ /*
+ * We allowed this configuration and don't want to break users over
+ * kernel upgrade. So warn instead of erroring out.
+ */
+ if (!err)
+ pr_warn("overlayfs: upper fs needs to support d_type.\n");
+
+ /* Check if upper/work fs supports O_TMPFILE */
+ temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0);
+ ofs->tmpfile = !IS_ERR(temp);
+ if (ofs->tmpfile)
+ dput(temp);
+ else
+ pr_warn("overlayfs: upper fs does not support tmpfile.\n");
+
+ /*
+ * Check if upper/work fs supports trusted.overlay.* xattr
+ */
+ err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0);
+ if (err) {
+ ofs->noxattr = true;
+ pr_warn("overlayfs: upper fs does not support xattr.\n");
+ } else {
+ vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
+ }
+
+ /* Check if upper/work fs supports file handles */
+ if (ofs->config.index &&
+ !ovl_can_decode_fh(ofs->workdir->d_sb)) {
+ ofs->config.index = false;
+ pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n");
+ }
+
+ return 0;
+}
+
+static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
+{
+ int err;
+ struct path workpath = { };
+
+ err = ovl_mount_dir(ofs->config.workdir, &workpath);
+ if (err)
+ goto out;
err = -EINVAL;
- if (!ufs->config.lowerdir) {
- if (!silent)
- pr_err("overlayfs: missing 'lowerdir'\n");
- goto out_free_config;
+ if (upperpath->mnt != workpath.mnt) {
+ pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+ goto out;
+ }
+ if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) {
+ pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+ goto out;
}
- sb->s_stack_depth = 0;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- if (ufs->config.upperdir) {
- if (!ufs->config.workdir) {
- pr_err("overlayfs: missing 'workdir'\n");
- goto out_free_config;
- }
+ err = -EBUSY;
+ if (ovl_inuse_trylock(workpath.dentry)) {
+ ofs->workdir_locked = true;
+ } else if (ofs->config.index) {
+ pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
+ goto out;
+ } else {
+ pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+ }
- err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
- if (err)
- goto out_free_config;
+ ofs->workbasedir = dget(workpath.dentry);
+ err = ovl_make_workdir(ofs, &workpath);
+ if (err)
+ goto out;
- /* Upper fs should not be r/o */
- if (sb_rdonly(upperpath.mnt->mnt_sb)) {
- pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
- err = -EINVAL;
- goto out_put_upperpath;
- }
+ err = 0;
+out:
+ path_put(&workpath);
- err = ovl_check_namelen(&upperpath, ufs, ufs->config.upperdir);
- if (err)
- goto out_put_upperpath;
-
- err = -EBUSY;
- if (ovl_inuse_trylock(upperpath.dentry)) {
- ufs->upperdir_locked = true;
- } else if (ufs->config.index) {
- pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
- goto out_put_upperpath;
- } else {
- pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
- }
+ return err;
+}
+
+static int ovl_get_indexdir(struct ovl_fs *ofs, struct ovl_entry *oe,
+ struct path *upperpath)
+{
+ int err;
- err = ovl_mount_dir(ufs->config.workdir, &workpath);
+ /* Verify lower root is upper root origin */
+ err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry,
+ false, true);
+ if (err) {
+ pr_err("overlayfs: failed to verify upper root origin\n");
+ goto out;
+ }
+
+ ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
+ if (ofs->indexdir) {
+ /* Verify upper root is index dir origin */
+ err = ovl_verify_origin(ofs->indexdir, upperpath->dentry,
+ true, true);
if (err)
- goto out_unlock_upperdentry;
+ pr_err("overlayfs: failed to verify index dir origin\n");
- err = -EINVAL;
- if (upperpath.mnt != workpath.mnt) {
- pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
- goto out_put_workpath;
- }
- if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
- pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
- goto out_put_workpath;
+ /* Cleanup bad/stale/orphan index entries */
+ if (!err)
+ err = ovl_indexdir_cleanup(ofs->indexdir,
+ ofs->upper_mnt,
+ oe->lowerstack,
+ oe->numlower);
+ }
+ if (err || !ofs->indexdir)
+ pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
+
+out:
+ return err;
+}
+
+static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack,
+ unsigned int numlower)
+{
+ int err;
+ unsigned int i;
+
+ err = -ENOMEM;
+ ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer),
+ GFP_KERNEL);
+ if (ofs->lower_layers == NULL)
+ goto out;
+ for (i = 0; i < numlower; i++) {
+ struct vfsmount *mnt;
+ dev_t dev;
+
+ err = get_anon_bdev(&dev);
+ if (err) {
+ pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n");
+ goto out;
}
- err = -EBUSY;
- if (ovl_inuse_trylock(workpath.dentry)) {
- ufs->workdir_locked = true;
- } else if (ufs->config.index) {
- pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
- goto out_put_workpath;
- } else {
- pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+ mnt = clone_private_mount(&stack[i]);
+ err = PTR_ERR(mnt);
+ if (IS_ERR(mnt)) {
+ pr_err("overlayfs: failed to clone lowerpath\n");
+ free_anon_bdev(dev);
+ goto out;
}
+ /*
+ * Make lower layers R/O. That way fchmod/fchown on lower file
+ * will fail instead of modifying lower fs.
+ */
+ mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
- ufs->workbasedir = workpath.dentry;
- sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth;
+ ofs->lower_layers[ofs->numlower].mnt = mnt;
+ ofs->lower_layers[ofs->numlower].pseudo_dev = dev;
+ ofs->numlower++;
+
+ /* Check if all lower layers are on same sb */
+ if (i == 0)
+ ofs->same_sb = mnt->mnt_sb;
+ else if (ofs->same_sb != mnt->mnt_sb)
+ ofs->same_sb = NULL;
}
+ err = 0;
+out:
+ return err;
+}
+
+static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
+ struct ovl_fs *ofs)
+{
+ int err;
+ char *lowertmp, *lower;
+ struct path *stack = NULL;
+ unsigned int stacklen, numlower = 0, i;
+ bool remote = false;
+ struct ovl_entry *oe;
+
err = -ENOMEM;
- lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL);
+ lowertmp = kstrdup(ofs->config.lowerdir, GFP_KERNEL);
if (!lowertmp)
- goto out_unlock_workdentry;
+ goto out_err;
err = -EINVAL;
stacklen = ovl_split_lowerdirs(lowertmp);
if (stacklen > OVL_MAX_STACK) {
pr_err("overlayfs: too many lower directories, limit is %d\n",
OVL_MAX_STACK);
- goto out_free_lowertmp;
- } else if (!ufs->config.upperdir && stacklen == 1) {
+ goto out_err;
+ } else if (!ofs->config.upperdir && stacklen == 1) {
pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
- goto out_free_lowertmp;
+ goto out_err;
}
err = -ENOMEM;
stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
if (!stack)
- goto out_free_lowertmp;
+ goto out_err;
err = -EINVAL;
lower = lowertmp;
for (numlower = 0; numlower < stacklen; numlower++) {
- err = ovl_lower_dir(lower, &stack[numlower], ufs,
+ err = ovl_lower_dir(lower, &stack[numlower], ofs,
&sb->s_stack_depth, &remote);
if (err)
- goto out_put_lowerpath;
+ goto out_err;
lower = strchr(lower, '\0') + 1;
}
@@ -957,190 +1107,144 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth++;
if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
pr_err("overlayfs: maximum fs stacking depth exceeded\n");
- goto out_put_lowerpath;
+ goto out_err;
}
- if (ufs->config.upperdir) {
- ufs->upper_mnt = clone_private_mount(&upperpath);
- err = PTR_ERR(ufs->upper_mnt);
- if (IS_ERR(ufs->upper_mnt)) {
- pr_err("overlayfs: failed to clone upperpath\n");
- goto out_put_lowerpath;
- }
+ err = ovl_get_lower_layers(ofs, stack, numlower);
+ if (err)
+ goto out_err;
+
+ err = -ENOMEM;
+ oe = ovl_alloc_entry(numlower);
+ if (!oe)
+ goto out_err;
+
+ for (i = 0; i < numlower; i++) {
+ oe->lowerstack[i].dentry = dget(stack[i].dentry);
+ oe->lowerstack[i].layer = &ofs->lower_layers[i];
+ }
- /* Don't inherit atime flags */
- ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
+ if (remote)
+ sb->s_d_op = &ovl_reval_dentry_operations;
+ else
+ sb->s_d_op = &ovl_dentry_operations;
- sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran;
+out:
+ for (i = 0; i < numlower; i++)
+ path_put(&stack[i]);
+ kfree(stack);
+ kfree(lowertmp);
- ufs->workdir = ovl_workdir_create(sb, ufs, workpath.dentry,
- OVL_WORKDIR_NAME, false);
- /*
- * Upper should support d_type, else whiteouts are visible.
- * Given workdir and upper are on same fs, we can do
- * iterate_dir() on workdir. This check requires successful
- * creation of workdir in previous step.
- */
- if (ufs->workdir) {
- struct dentry *temp;
-
- err = ovl_check_d_type_supported(&workpath);
- if (err < 0)
- goto out_put_workdir;
-
- /*
- * We allowed this configuration and don't want to
- * break users over kernel upgrade. So warn instead
- * of erroring out.
- */
- if (!err)
- pr_warn("overlayfs: upper fs needs to support d_type.\n");
-
- /* Check if upper/work fs supports O_TMPFILE */
- temp = ovl_do_tmpfile(ufs->workdir, S_IFREG | 0);
- ufs->tmpfile = !IS_ERR(temp);
- if (ufs->tmpfile)
- dput(temp);
- else
- pr_warn("overlayfs: upper fs does not support tmpfile.\n");
-
- /*
- * Check if upper/work fs supports trusted.overlay.*
- * xattr
- */
- err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE,
- "0", 1, 0);
- if (err) {
- ufs->noxattr = true;
- pr_warn("overlayfs: upper fs does not support xattr.\n");
- } else {
- vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE);
- }
+ return oe;
- /* Check if upper/work fs supports file handles */
- if (ufs->config.index &&
- !ovl_can_decode_fh(ufs->workdir->d_sb)) {
- ufs->config.index = false;
- pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n");
- }
- }
- }
+out_err:
+ oe = ERR_PTR(err);
+ goto out;
+}
+
+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct path upperpath = { };
+ struct dentry *root_dentry;
+ struct ovl_entry *oe;
+ struct ovl_fs *ofs;
+ struct cred *cred;
+ int err;
err = -ENOMEM;
- ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL);
- if (ufs->lower_mnt == NULL)
- goto out_put_workdir;
- for (i = 0; i < numlower; i++) {
- struct vfsmount *mnt = clone_private_mount(&stack[i]);
+ ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+ if (!ofs)
+ goto out;
- err = PTR_ERR(mnt);
- if (IS_ERR(mnt)) {
- pr_err("overlayfs: failed to clone lowerpath\n");
- goto out_put_lower_mnt;
- }
- /*
- * Make lower_mnt R/O. That way fchmod/fchown on lower file
- * will fail instead of modifying lower fs.
- */
- mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
+ ofs->creator_cred = cred = prepare_creds();
+ if (!cred)
+ goto out_err;
- ufs->lower_mnt[ufs->numlower] = mnt;
- ufs->numlower++;
+ ofs->config.redirect_dir = ovl_redirect_dir_def;
+ ofs->config.index = ovl_index_def;
+ err = ovl_parse_opt((char *) data, &ofs->config);
+ if (err)
+ goto out_err;
- /* Check if all lower layers are on same sb */
- if (i == 0)
- ufs->same_sb = mnt->mnt_sb;
- else if (ufs->same_sb != mnt->mnt_sb)
- ufs->same_sb = NULL;
+ err = -EINVAL;
+ if (!ofs->config.lowerdir) {
+ if (!silent)
+ pr_err("overlayfs: missing 'lowerdir'\n");
+ goto out_err;
}
- /* If the upper fs is nonexistent, we mark overlayfs r/o too */
- if (!ufs->upper_mnt)
- sb->s_flags |= MS_RDONLY;
- else if (ufs->upper_mnt->mnt_sb != ufs->same_sb)
- ufs->same_sb = NULL;
-
- if (!(ovl_force_readonly(ufs)) && ufs->config.index) {
- /* Verify lower root is upper root origin */
- err = ovl_verify_origin(upperpath.dentry, ufs->lower_mnt[0],
- stack[0].dentry, false, true);
- if (err) {
- pr_err("overlayfs: failed to verify upper root origin\n");
- goto out_put_lower_mnt;
+ sb->s_stack_depth = 0;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ if (ofs->config.upperdir) {
+ if (!ofs->config.workdir) {
+ pr_err("overlayfs: missing 'workdir'\n");
+ goto out_err;
}
- ufs->indexdir = ovl_workdir_create(sb, ufs, workpath.dentry,
- OVL_INDEXDIR_NAME, true);
- if (ufs->indexdir) {
- /* Verify upper root is index dir origin */
- err = ovl_verify_origin(ufs->indexdir, ufs->upper_mnt,
- upperpath.dentry, true, true);
- if (err)
- pr_err("overlayfs: failed to verify index dir origin\n");
+ err = ovl_get_upper(ofs, &upperpath);
+ if (err)
+ goto out_err;
- /* Cleanup bad/stale/orphan index entries */
- if (!err)
- err = ovl_indexdir_cleanup(ufs->indexdir,
- ufs->upper_mnt,
- stack, numlower);
- }
- if (err || !ufs->indexdir)
- pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
+ err = ovl_get_workdir(ofs, &upperpath);
if (err)
- goto out_put_indexdir;
+ goto out_err;
+
+ if (!ofs->workdir)
+ sb->s_flags |= MS_RDONLY;
+
+ sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth;
+ sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran;
+
}
+ oe = ovl_get_lowerstack(sb, ofs);
+ err = PTR_ERR(oe);
+ if (IS_ERR(oe))
+ goto out_err;
- /* Show index=off/on in /proc/mounts for any of the reasons above */
- if (!ufs->indexdir)
- ufs->config.index = false;
+ /* If the upper fs is nonexistent, we mark overlayfs r/o too */
+ if (!ofs->upper_mnt)
+ sb->s_flags |= MS_RDONLY;
+ else if (ofs->upper_mnt->mnt_sb != ofs->same_sb)
+ ofs->same_sb = NULL;
- if (remote)
- sb->s_d_op = &ovl_reval_dentry_operations;
- else
- sb->s_d_op = &ovl_dentry_operations;
+ if (!(ovl_force_readonly(ofs)) && ofs->config.index) {
+ err = ovl_get_indexdir(ofs, oe, &upperpath);
+ if (err)
+ goto out_free_oe;
- err = -ENOMEM;
- ufs->creator_cred = cred = prepare_creds();
- if (!cred)
- goto out_put_indexdir;
+ if (!ofs->indexdir)
+ sb->s_flags |= MS_RDONLY;
+ }
+
+ /* Show index=off/on in /proc/mounts for any of the reasons above */
+ if (!ofs->indexdir)
+ ofs->config.index = false;
/* Never override disk quota limits or use reserved space */
cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
- err = -ENOMEM;
- oe = ovl_alloc_entry(numlower);
- if (!oe)
- goto out_put_cred;
-
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
sb->s_op = &ovl_super_operations;
sb->s_xattr = ovl_xattr_handlers;
- sb->s_fs_info = ufs;
+ sb->s_fs_info = ofs;
sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
+ err = -ENOMEM;
root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
if (!root_dentry)
goto out_free_oe;
mntput(upperpath.mnt);
- for (i = 0; i < numlower; i++)
- mntput(stack[i].mnt);
- mntput(workpath.mnt);
- kfree(lowertmp);
-
if (upperpath.dentry) {
oe->has_upper = true;
if (ovl_is_impuredir(upperpath.dentry))
ovl_set_flag(OVL_IMPURE, d_inode(root_dentry));
}
- for (i = 0; i < numlower; i++) {
- oe->lowerstack[i].dentry = stack[i].dentry;
- oe->lowerstack[i].mnt = ufs->lower_mnt[i];
- }
- kfree(stack);
root_dentry->d_fsdata = oe;
+ /* Root is always merge -> can have whiteouts */
+ ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
ovl_dentry_lower(root_dentry));
@@ -1149,39 +1253,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
return 0;
out_free_oe:
+ ovl_entry_stack_free(oe);
kfree(oe);
-out_put_cred:
- put_cred(ufs->creator_cred);
-out_put_indexdir:
- dput(ufs->indexdir);
-out_put_lower_mnt:
- for (i = 0; i < ufs->numlower; i++)
- mntput(ufs->lower_mnt[i]);
- kfree(ufs->lower_mnt);
-out_put_workdir:
- dput(ufs->workdir);
- mntput(ufs->upper_mnt);
-out_put_lowerpath:
- for (i = 0; i < numlower; i++)
- path_put(&stack[i]);
- kfree(stack);
-out_free_lowertmp:
- kfree(lowertmp);
-out_unlock_workdentry:
- if (ufs->workdir_locked)
- ovl_inuse_unlock(workpath.dentry);
-out_put_workpath:
- path_put(&workpath);
-out_unlock_upperdentry:
- if (ufs->upperdir_locked)
- ovl_inuse_unlock(upperpath.dentry);
-out_put_upperpath:
+out_err:
path_put(&upperpath);
-out_free_config:
- kfree(ufs->config.lowerdir);
- kfree(ufs->config.upperdir);
- kfree(ufs->config.workdir);
- kfree(ufs);
+ ovl_free_fs(ofs);
out:
return err;
}
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index b9b239fa5cfd..d6bb1c9f5e7a 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -17,7 +17,6 @@
#include <linux/namei.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
-#include "ovl_entry.h"
int ovl_want_write(struct dentry *dentry)
{
@@ -125,7 +124,12 @@ void ovl_path_lower(struct dentry *dentry, struct path *path)
{
struct ovl_entry *oe = dentry->d_fsdata;
- *path = oe->numlower ? oe->lowerstack[0] : (struct path) { };
+ if (oe->numlower) {
+ path->mnt = oe->lowerstack[0].layer->mnt;
+ path->dentry = oe->lowerstack[0].dentry;
+ } else {
+ *path = (struct path) { };
+ }
}
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
@@ -329,6 +333,19 @@ void ovl_copy_up_end(struct dentry *dentry)
mutex_unlock(&OVL_I(d_inode(dentry))->lock);
}
+bool ovl_check_origin_xattr(struct dentry *dentry)
+{
+ int res;
+
+ res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0);
+
+ /* Zero size value means "copied up but origin unknown" */
+ if (res >= 0)
+ return true;
+
+ return false;
+}
+
bool ovl_check_dir_xattr(struct dentry *dentry, const char *name)
{
int res;
diff --git a/fs/pipe.c b/fs/pipe.c
index 349c9d56d4b3..6d98566201ef 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fops = {
/*
* Currently we rely on the pipe array holding a power-of-2 number
- * of pages.
+ * of pages. Returns 0 on error.
*/
-static inline unsigned int round_pipe_size(unsigned int size)
+unsigned int round_pipe_size(unsigned int size)
{
unsigned long nr_pages;
+ if (size < pipe_min_size)
+ size = pipe_min_size;
+
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (nr_pages == 0)
+ return 0;
+
return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
@@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
long ret = 0;
size = round_pipe_size(arg);
+ if (size == 0)
+ return -EINVAL;
nr_pages = size >> PAGE_SHIFT;
if (!nr_pages)
@@ -1117,20 +1125,13 @@ out_revert_acct:
}
/*
- * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size
* will return an error.
*/
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
size_t *lenp, loff_t *ppos)
{
- int ret;
-
- ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
- if (ret < 0 || !write)
- return ret;
-
- pipe_max_size = round_pipe_size(pipe_max_size);
- return ret;
+ return proc_dopipe_max_size(table, write, buf, lenp, ppos);
}
/*
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index f7456c4e7d0f..ead487e80510 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -21,6 +21,7 @@ proc-y += loadavg.o
proc-y += meminfo.o
proc-y += stat.o
proc-y += uptime.o
+proc-y += util.o
proc-y += version.o
proc-y += softirqs.o
proc-y += namespaces.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 6f6fc1672ad1..79375fc115d2 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -366,6 +366,11 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
cpumask_pr_args(&task->cpus_allowed));
}
+static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state);
+}
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -376,6 +381,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
if (mm) {
task_mem(m, mm);
+ task_core_dumping(m, mm);
mmput(mm);
}
task_sig(m, task);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d357b2ea6cb..31934cb9dfc8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1682,7 +1682,7 @@ const struct inode_operations proc_pid_link_inode_operations = {
/* building an inode */
-void task_dump_owner(struct task_struct *task, mode_t mode,
+void task_dump_owner(struct task_struct *task, umode_t mode,
kuid_t *ruid, kgid_t *rgid)
{
/* Depending on the state of dumpable compute who should own a
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index e0f867cd8553..96f1087e372c 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -1,12 +1,18 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpufreq.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+__weak void arch_freq_prepare_all(void)
+{
+}
+
extern const struct seq_operations cpuinfo_op;
static int cpuinfo_open(struct inode *inode, struct file *file)
{
+ arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a34195e92b20..4a67188c8d74 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -100,31 +100,10 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
return get_pid_task(proc_pid(inode), PIDTYPE_PID);
}
-void task_dump_owner(struct task_struct *task, mode_t mode,
+void task_dump_owner(struct task_struct *task, umode_t mode,
kuid_t *ruid, kgid_t *rgid);
-static inline unsigned name_to_int(const struct qstr *qstr)
-{
- const char *name = qstr->name;
- int len = qstr->len;
- unsigned n = 0;
-
- if (len > 1 && *name == '0')
- goto out;
- while (len-- > 0) {
- unsigned c = *name++ - '0';
- if (c > 9)
- goto out;
- if (n >= (~0U-9)/10)
- goto out;
- n *= 10;
- n += c;
- }
- return n;
-out:
- return ~0U;
-}
-
+unsigned name_to_int(const struct qstr *qstr);
/*
* Offset of the first process in the /proc root directory..
*/
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bc5c58c00ee..a000d7547479 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
- task_active_pid_ns(current)->last_pid);
+ idr_get_cursor(&task_active_pid_ns(current)->idr));
return 0;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 875231c36cb3..339e4c1c044d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
+ [ilog2(VM_SYNC)] = "sf",
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_WIPEONFORK)] = "wf",
[ilog2(VM_DONTDUMP)] = "dd",
diff --git a/fs/proc/util.c b/fs/proc/util.c
new file mode 100644
index 000000000000..b161cfa0f9fa
--- /dev/null
+++ b/fs/proc/util.c
@@ -0,0 +1,23 @@
+#include <linux/dcache.h>
+
+unsigned name_to_int(const struct qstr *qstr)
+{
+ const char *name = qstr->name;
+ int len = qstr->len;
+ unsigned n = 0;
+
+ if (len > 1 && *name == '0')
+ goto out;
+ do {
+ unsigned c = *name++ - '0';
+ if (c > 9)
+ goto out;
+ if (n >= (~0U-9)/10)
+ goto out;
+ n *= 10;
+ n += c;
+ } while (--len > 0);
+ return n;
+out:
+ return ~0U;
+}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 086e491faf04..423159abd501 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -651,7 +651,7 @@ static int pstore_write_user_compat(struct pstore_record *record,
return -EINVAL;
record->buf = memdup_user(buf, record->size);
- if (unlikely(IS_ERR(record->buf))) {
+ if (IS_ERR(record->buf)) {
ret = PTR_ERR(record->buf);
goto out;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 0046d72efe94..f8547b82dfb3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -635,27 +635,6 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
return ret;
}
-/*
- * Reduce an iovec's length in-place. Return the resulting number of segments
- */
-unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
-{
- unsigned long seg = 0;
- size_t len = 0;
-
- while (seg < nr_segs) {
- seg++;
- if (len + iov->iov_len >= to) {
- iov->iov_len = to - len;
- break;
- }
- len += iov->iov_len;
- iov++;
- }
- return seg;
-}
-EXPORT_SYMBOL(iov_shorten);
-
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
loff_t *ppos, int type, rwf_t flags)
{
diff --git a/fs/select.c b/fs/select.c
index 063067e606ca..6de493bb42a4 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -292,8 +292,7 @@ static int poll_select_copy_remaining(struct timespec64 *end_time,
void __user *p,
int timeval, int ret)
{
- struct timespec64 rts64;
- struct timespec rts;
+ struct timespec64 rts;
struct timeval rtv;
if (!p)
@@ -306,23 +305,22 @@ static int poll_select_copy_remaining(struct timespec64 *end_time,
if (!end_time->tv_sec && !end_time->tv_nsec)
return ret;
- ktime_get_ts64(&rts64);
- rts64 = timespec64_sub(*end_time, rts64);
- if (rts64.tv_sec < 0)
- rts64.tv_sec = rts64.tv_nsec = 0;
+ ktime_get_ts64(&rts);
+ rts = timespec64_sub(*end_time, rts);
+ if (rts.tv_sec < 0)
+ rts.tv_sec = rts.tv_nsec = 0;
- rts = timespec64_to_timespec(rts64);
if (timeval) {
if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
memset(&rtv, 0, sizeof(rtv));
- rtv.tv_sec = rts64.tv_sec;
- rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
+ rtv.tv_sec = rts.tv_sec;
+ rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
if (!copy_to_user(p, &rtv, sizeof(rtv)))
return ret;
- } else if (!copy_to_user(p, &rts, sizeof(rts)))
+ } else if (!put_timespec64(&rts, p))
return ret;
/*
@@ -705,17 +703,15 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts;
- struct timespec64 ts64, end_time, *to = NULL;
+ struct timespec64 ts, end_time, *to = NULL;
int ret;
if (tsp) {
- if (copy_from_user(&ts, tsp, sizeof(ts)))
+ if (get_timespec64(&ts, tsp))
return -EFAULT;
- ts64 = timespec_to_timespec64(ts);
to = &end_time;
- if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
return -EINVAL;
}
@@ -1052,12 +1048,11 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
size_t, sigsetsize)
{
sigset_t ksigmask, sigsaved;
- struct timespec ts;
- struct timespec64 end_time, *to = NULL;
+ struct timespec64 ts, end_time, *to = NULL;
int ret;
if (tsp) {
- if (copy_from_user(&ts, tsp, sizeof(ts)))
+ if (get_timespec64(&ts, tsp))
return -EFAULT;
to = &end_time;
@@ -1103,10 +1098,10 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
static
-int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p,
int timeval, int ret)
{
- struct timespec ts;
+ struct timespec64 ts;
if (!p)
return ret;
@@ -1118,8 +1113,8 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
if (!end_time->tv_sec && !end_time->tv_nsec)
return ret;
- ktime_get_ts(&ts);
- ts = timespec_sub(*end_time, ts);
+ ktime_get_ts64(&ts);
+ ts = timespec64_sub(*end_time, ts);
if (ts.tv_sec < 0)
ts.tv_sec = ts.tv_nsec = 0;
@@ -1132,12 +1127,7 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
if (!copy_to_user(p, &rtv, sizeof(rtv)))
return ret;
} else {
- struct compat_timespec rts;
-
- rts.tv_sec = ts.tv_sec;
- rts.tv_nsec = ts.tv_nsec;
-
- if (!copy_to_user(p, &rts, sizeof(rts)))
+ if (!compat_put_timespec64(&ts, p))
return ret;
}
/*
@@ -1195,7 +1185,7 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
*/
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct timespec *end_time)
+ struct timespec64 *end_time)
{
fd_set_bits fds;
void *bits;
@@ -1268,7 +1258,7 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
struct compat_timeval __user *, tvp)
{
- struct timespec end_time, *to = NULL;
+ struct timespec64 end_time, *to = NULL;
struct compat_timeval tv;
int ret;
@@ -1312,14 +1302,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
compat_size_t sigsetsize)
{
- compat_sigset_t ss32;
sigset_t ksigmask, sigsaved;
- struct compat_timespec ts;
- struct timespec end_time, *to = NULL;
+ struct timespec64 ts, end_time, *to = NULL;
int ret;
if (tsp) {
- if (copy_from_user(&ts, tsp, sizeof(ts)))
+ if (compat_get_timespec64(&ts, tsp))
return -EFAULT;
to = &end_time;
@@ -1330,9 +1318,8 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
if (sigmask) {
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ if (get_compat_sigset(&ksigmask, sigmask))
return -EFAULT;
- sigset_from_compat(&ksigmask, &ss32);
sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
@@ -1381,14 +1368,12 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
unsigned int, nfds, struct compat_timespec __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- compat_sigset_t ss32;
sigset_t ksigmask, sigsaved;
- struct compat_timespec ts;
- struct timespec end_time, *to = NULL;
+ struct timespec64 ts, end_time, *to = NULL;
int ret;
if (tsp) {
- if (copy_from_user(&ts, tsp, sizeof(ts)))
+ if (compat_get_timespec64(&ts, tsp))
return -EFAULT;
to = &end_time;
@@ -1399,9 +1384,8 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
if (sigmask) {
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ if (get_compat_sigset(&ksigmask, sigmask))
return -EFAULT;
- sigset_from_compat(&ksigmask, &ss32);
sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c667af86da5..5f1ff8756595 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -313,15 +313,13 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
compat_size_t, sigsetsize,
int, flags)
{
- compat_sigset_t ss32;
sigset_t tmp;
sigset_t __user *ksigmask;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ if (get_compat_sigset(&tmp, sigmask))
return -EFAULT;
- sigset_from_compat(&tmp, &ss32);
ksigmask = compat_alloc_user_space(sizeof(sigset_t));
if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
return -EFAULT;
diff --git a/fs/statfs.c b/fs/statfs.c
index c25dd9a26cc1..b072a8bab71a 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -217,7 +217,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user
return error;
}
-int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
+static int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
{
struct super_block *s = user_get_super(dev);
int err;
diff --git a/fs/super.c b/fs/super.c
index 994db21f59bf..d4e33e8f1e6f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -155,21 +155,19 @@ static void destroy_super_rcu(struct rcu_head *head)
schedule_work(&s->destroy_work);
}
-/**
- * destroy_super - frees a superblock
- * @s: superblock to free
- *
- * Frees a superblock.
- */
-static void destroy_super(struct super_block *s)
+/* Free a superblock that has never been seen by anyone */
+static void destroy_unused_super(struct super_block *s)
{
+ if (!s)
+ return;
+ up_write(&s->s_umount);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
- WARN_ON(!list_empty(&s->s_mounts));
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
- call_rcu(&s->rcu, destroy_super_rcu);
+ /* no delays needed */
+ destroy_super_work(&s->destroy_work);
}
/**
@@ -257,7 +255,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
return s;
fail:
- destroy_super(s);
+ destroy_unused_super(s);
return NULL;
}
@@ -266,11 +264,17 @@ fail:
/*
* Drop a superblock's refcount. The caller must hold sb_lock.
*/
-static void __put_super(struct super_block *sb)
+static void __put_super(struct super_block *s)
{
- if (!--sb->s_count) {
- list_del_init(&sb->s_list);
- destroy_super(sb);
+ if (!--s->s_count) {
+ list_del_init(&s->s_list);
+ WARN_ON(s->s_dentry_lru.node);
+ WARN_ON(s->s_inode_lru.node);
+ WARN_ON(!list_empty(&s->s_mounts));
+ security_sb_free(s);
+ put_user_ns(s->s_user_ns);
+ kfree(s->s_subtype);
+ call_rcu(&s->rcu, destroy_super_rcu);
}
}
@@ -485,19 +489,12 @@ retry:
continue;
if (user_ns != old->s_user_ns) {
spin_unlock(&sb_lock);
- if (s) {
- up_write(&s->s_umount);
- destroy_super(s);
- }
+ destroy_unused_super(s);
return ERR_PTR(-EBUSY);
}
if (!grab_super(old))
goto retry;
- if (s) {
- up_write(&s->s_umount);
- destroy_super(s);
- s = NULL;
- }
+ destroy_unused_super(s);
return old;
}
}
@@ -512,8 +509,7 @@ retry:
err = set(s, data);
if (err) {
spin_unlock(&sb_lock);
- up_write(&s->s_umount);
- destroy_super(s);
+ destroy_unused_super(s);
return ERR_PTR(err);
}
s->s_type = type;
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 343a94246f5b..89bf16b4d937 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -302,7 +302,7 @@ xfs_iext_rec_cmp(
xfs_fileoff_t offset)
{
uint64_t rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK;
- u32 rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
+ uint32_t rec_len = rec->hi & XFS_IEXT_LENGTH_MASK;
if (rec_offset > offset)
return 1;
@@ -850,9 +850,9 @@ static void
xfs_iext_free_last_leaf(
struct xfs_ifork *ifp)
{
- ifp->if_u1.if_root = NULL;
ifp->if_height--;
kmem_free(ifp->if_u1.if_root);
+ ifp->if_u1.if_root = NULL;
}
void
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 1c90ec41e9df..c79a1616b79d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -42,11 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
-static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
-{
- return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
-}
-
/*
* Copy inode type and data and attr format specific information from the
* on-disk inode to the in-core inode and fork structures. For fifos, devices,
@@ -792,7 +787,8 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_DEV:
if (iip->ili_fields & XFS_ILOG_DEV) {
ASSERT(whichfork == XFS_DATA_FORK);
- xfs_dinode_put_rdev(dip, sysv_encode_dev(VFS_I(ip)->i_rdev));
+ xfs_dinode_put_rdev(dip,
+ linux_to_xfs_dev_t(VFS_I(ip)->i_rdev));
}
break;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 996f035ee205..349d9f8edb89 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -274,7 +274,7 @@ struct xfs_inode_log_format {
uint64_t ilf_ino; /* inode number */
union {
uint32_t ilfu_rdev; /* rdev value for dev inode*/
- u8 __pad[16]; /* unused */
+ uint8_t __pad[16]; /* unused */
} ilf_u;
int64_t ilf_blkno; /* blkno of inode buffer */
int32_t ilf_len; /* len of inode buffer */
@@ -295,7 +295,7 @@ struct xfs_inode_log_format_32 {
uint64_t ilf_ino; /* inode number */
union {
uint32_t ilfu_rdev; /* rdev value for dev inode*/
- u8 __pad[16]; /* unused */
+ uint8_t __pad[16]; /* unused */
} ilf_u;
int64_t ilf_blkno; /* blkno of inode buffer */
int32_t ilf_len; /* len of inode buffer */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 18146873a8b3..8601275cc5e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -44,6 +44,7 @@
#include <linux/falloc.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
+#include <linux/mman.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -1045,7 +1046,11 @@ __xfs_filemap_fault(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
+ pfn_t pfn;
+
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, pe_size, pfn);
} else {
if (write_fault)
ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite(
}
/*
- * pfn_mkwrite was originally inteneded to ensure we capture time stamp
- * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
- * to ensure we serialise the fault barrier in place.
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing, so handle it as a standard write fault.
*/
static int
xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct xfs_inode *ip = XFS_I(inode);
- int ret = VM_FAULT_NOPAGE;
- loff_t size;
-
- trace_xfs_filemap_pfn_mkwrite(ip);
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
-
- /* check if the faulting page hasn't raced with truncate */
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- ret = VM_FAULT_SIGBUS;
- else if (IS_DAX(inode))
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- sb_end_pagefault(inode->i_sb);
- return ret;
-
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1136,6 +1120,13 @@ xfs_file_mmap(
struct file *filp,
struct vm_area_struct *vma)
{
+ /*
+ * We don't support synchronous mappings for non-DAX files. At least
+ * until someone comes up with a sensible use case.
+ */
+ if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+ return -EOPNOTSUPP;
+
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
@@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = {
.compat_ioctl = xfs_file_compat_ioctl,
#endif
.mmap = xfs_file_mmap,
+ .mmap_supported_flags = MAP_SYNC,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d8226f7a5dde..61d1cb7dc10d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2357,6 +2357,7 @@ retry:
*/
if (ip->i_ino != inum + i) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ rcu_read_unlock();
continue;
}
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18077e2189a9..33eb4fb2e3fd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
@@ -1089,6 +1090,10 @@ xfs_file_iomap_begin(
trace_xfs_iomap_found(ip, offset, length, 0, &imap);
}
+ if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
+ & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+
xfs_bmbt_to_iomap(ip, iomap, &imap);
if (shared)
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 6282bfc1afa9..99562ec0de56 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -204,6 +204,16 @@ static inline kgid_t xfs_gid_to_kgid(uint32_t gid)
return make_kgid(&init_user_ns, gid);
}
+static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
+{
+ return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
+}
+
+static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev)
+{
+ return sysv_encode_dev(dev);
+}
+
/*
* Various platform dependent calls that don't fit anywhere else
*/
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 515ba042d75c..d718a10c2271 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
-DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
-
TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault),