summary | refs | log | tree | commit | diff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r-- fs/afs/callback.c 4
-rw-r--r-- fs/afs/inode.c 31
-rw-r--r-- fs/afs/internal.h 8
-rw-r--r-- fs/afs/volume.c 1
-rw-r--r-- fs/aio.c 28
-rw-r--r-- fs/binfmt_flat.c 23
-rw-r--r-- fs/ceph/mds_client.c 3
-rw-r--r-- fs/cifs/smb2ops.c 64
-rw-r--r-- fs/cifs/smb2pdu.h 14
-rw-r--r-- fs/dax.c 9
-rw-r--r-- fs/eventpoll.c 4
-rw-r--r-- fs/inode.c 2
-rw-r--r-- fs/io_uring.c 12
-rw-r--r-- fs/namespace.c 7
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c 2
-rw-r--r-- fs/proc/array.c 2
-rw-r--r-- fs/proc/base.c 3
-rw-r--r-- fs/select.c 18
-rw-r--r-- fs/userfaultfd.c 42
19 files changed, 179 insertions, 98 deletions
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index d441bef72163..915010464572 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -275,9 +275,9 @@ static void afs_break_one_callback(struct afs_server *server,
struct afs_super_info *as = AFS_FS_S(cbi->sb);
struct afs_volume *volume = as->volume;
- write_lock(&volume->cb_break_lock);
+ write_lock(&volume->cb_v_break_lock);
volume->cb_v_break++;
- write_unlock(&volume->cb_break_lock);
+ write_unlock(&volume->cb_v_break_lock);
} else {
data.volume = NULL;
data.fid = *fid;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index b42d9d09669c..18a50d4febcf 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -56,6 +56,16 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
}
/*
+ * Set the file size and block count. Estimate the number of 512 bytes blocks
+ * used, rounded up to nearest 1K for consistency with other AFS clients.
+ */
+static void afs_set_i_size(struct afs_vnode *vnode, u64 size)
+{
+ i_size_write(&vnode->vfs_inode, size);
+ vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
+}
+
+/*
* Initialise an inode from the vnode status.
*/
static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key,
@@ -124,12 +134,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key,
return afs_protocol_error(NULL, -EBADMSG, afs_eproto_file_type);
}
- /*
- * Estimate 512 bytes blocks used, rounded up to nearest 1K
- * for consistency with other AFS clients.
- */
- inode->i_blocks = ((i_size_read(inode) + 1023) >> 10) << 1;
- i_size_write(&vnode->vfs_inode, status->size);
+ afs_set_i_size(vnode, status->size);
vnode->invalid_before = status->data_version;
inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
@@ -207,11 +212,13 @@ static void afs_apply_status(struct afs_fs_cursor *fc,
if (expected_version &&
*expected_version != status->data_version) {
- kdebug("vnode modified %llx on {%llx:%llu} [exp %llx] %s",
- (unsigned long long) status->data_version,
- vnode->fid.vid, vnode->fid.vnode,
- (unsigned long long) *expected_version,
- fc->type ? fc->type->name : "???");
+ if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+ pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n",
+ vnode->fid.vid, vnode->fid.vnode,
+ (unsigned long long)*expected_version,
+ (unsigned long long)status->data_version,
+ fc->type ? fc->type->name : "???");
+
vnode->invalid_before = status->data_version;
if (vnode->status.type == AFS_FTYPE_DIR) {
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
@@ -230,7 +237,7 @@ static void afs_apply_status(struct afs_fs_cursor *fc,
if (data_changed) {
inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
- i_size_write(&vnode->vfs_inode, status->size);
+ afs_set_i_size(vnode, status->size);
}
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8a67bf741880..7ee63526c6a2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -109,10 +109,8 @@ struct afs_call {
struct rxrpc_call *rxcall; /* RxRPC call handle */
struct key *key; /* security for this call */
struct afs_net *net; /* The network namespace */
- union {
- struct afs_server *server;
- struct afs_vlserver *vlserver;
- };
+ struct afs_server *server; /* The fileserver record if fs op (pins ref) */
+ struct afs_vlserver *vlserver; /* The vlserver record if vl op */
struct afs_cb_interest *cbi; /* Callback interest for server used */
struct afs_vnode *lvnode; /* vnode being locked */
void *request; /* request data (first part) */
@@ -616,7 +614,7 @@ struct afs_volume {
unsigned int servers_seq; /* Incremented each time ->servers changes */
unsigned cb_v_break; /* Break-everything counter. */
- rwlock_t cb_break_lock;
+ rwlock_t cb_v_break_lock;
afs_voltype_t type; /* type of volume */
short error;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 08fdb3951c49..1a414300b654 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -43,6 +43,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
atomic_set(&volume->usage, 1);
INIT_LIST_HEAD(&volume->proc_link);
rwlock_init(&volume->servers_lock);
+ rwlock_init(&volume->cb_v_break_lock);
memcpy(volume->name, vldb->name, vldb->name_len + 1);
slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask);
diff --git a/fs/aio.c b/fs/aio.c
index 3490d1fa0e16..c1e581dd32f5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2095,6 +2095,7 @@ SYSCALL_DEFINE6(io_pgetevents,
struct __aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 ts;
+ bool interrupted;
int ret;
if (timeout && unlikely(get_timespec64(&ts, timeout)))
@@ -2108,8 +2109,10 @@ SYSCALL_DEFINE6(io_pgetevents,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2128,6 +2131,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
struct __aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 ts;
+ bool interrupted;
int ret;
if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
@@ -2142,8 +2146,10 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2193,6 +2199,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
struct __compat_aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 t;
+ bool interrupted;
int ret;
if (timeout && get_old_timespec32(&t, timeout))
@@ -2206,8 +2213,10 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2226,6 +2235,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
struct __compat_aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 t;
+ bool interrupted;
int ret;
if (timeout && get_timespec64(&t, timeout))
@@ -2239,8 +2249,10 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 82a48e830018..e4b59e76afb0 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -856,9 +856,14 @@ err:
static int load_flat_shared_library(int id, struct lib_info *libs)
{
+ /*
+ * This is a fake bprm struct; only the members "buf", "file" and
+ * "filename" are actually used.
+ */
struct linux_binprm bprm;
int res;
char buf[16];
+ loff_t pos = 0;
memset(&bprm, 0, sizeof(bprm));
@@ -872,25 +877,11 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
if (IS_ERR(bprm.file))
return res;
- bprm.cred = prepare_exec_creds();
- res = -ENOMEM;
- if (!bprm.cred)
- goto out;
-
- /* We don't really care about recalculating credentials at this point
- * as we're past the point of no return and are dealing with shared
- * libraries.
- */
- bprm.called_set_creds = 1;
+ res = kernel_read(bprm.file, bprm.buf, BINPRM_BUF_SIZE, &pos);
- res = prepare_binprm(&bprm);
-
- if (!res)
+ if (res >= 0)
res = load_flat_file(&bprm, libs, id, NULL);
- abort_creds(bprm.cred);
-
-out:
allow_write_access(bprm.file);
fput(bprm.file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6af2d0d4a87a..c8a9b89b922d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2121,9 +2121,10 @@ retry:
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
dout("build_path path+%d: %p SNAPDIR\n",
pos, temp);
- } else if (stop_on_nosnap && inode &&
+ } else if (stop_on_nosnap && inode && dentry != temp &&
ceph_snap(inode) == CEPH_NOSNAP) {
spin_unlock(&temp->d_lock);
+ pos++; /* get rid of any prepended '/' */
break;
} else {
pos -= temp->d_name.len;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 3fdc6a41b304..9fd56b0acd7e 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -2372,6 +2372,41 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
kfree(dfs_rsp);
return rc;
}
+
+static int
+parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf,
+ u32 plen, char **target_path,
+ struct cifs_sb_info *cifs_sb)
+{
+ unsigned int sub_len;
+ unsigned int sub_offset;
+
+ /* We only handle Symbolic Link : MS-FSCC 2.1.2.4 */
+ if (le32_to_cpu(symlink_buf->ReparseTag) != IO_REPARSE_TAG_SYMLINK) {
+ cifs_dbg(VFS, "srv returned invalid symlink buffer\n");
+ return -EIO;
+ }
+
+ sub_offset = le16_to_cpu(symlink_buf->SubstituteNameOffset);
+ sub_len = le16_to_cpu(symlink_buf->SubstituteNameLength);
+ if (sub_offset + 20 > plen ||
+ sub_offset + sub_len + 20 > plen) {
+ cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
+ return -EIO;
+ }
+
+ *target_path = cifs_strndup_from_utf16(
+ symlink_buf->PathBuffer + sub_offset,
+ sub_len, true, cifs_sb->local_nls);
+ if (!(*target_path))
+ return -ENOMEM;
+
+ convert_delimiter(*target_path, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+
+ return 0;
+}
+
#define SMB2_SYMLINK_STRUCT_SIZE \
(sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
@@ -2401,11 +2436,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec close_iov[1];
struct smb2_create_rsp *create_rsp;
struct smb2_ioctl_rsp *ioctl_rsp;
- char *ioctl_buf;
+ struct reparse_data_buffer *reparse_buf;
u32 plen;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+ *target_path = NULL;
+
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -2483,17 +2520,36 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
if ((rc == 0) && (is_reparse_point)) {
/* See MS-FSCC 2.3.23 */
- ioctl_buf = (char *)ioctl_rsp + le32_to_cpu(ioctl_rsp->OutputOffset);
+ reparse_buf = (struct reparse_data_buffer *)
+ ((char *)ioctl_rsp +
+ le32_to_cpu(ioctl_rsp->OutputOffset));
plen = le32_to_cpu(ioctl_rsp->OutputCount);
if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) >
rsp_iov[1].iov_len) {
- cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", plen);
+ cifs_dbg(VFS, "srv returned invalid ioctl len: %d\n",
+ plen);
+ rc = -EIO;
+ goto querty_exit;
+ }
+
+ if (plen < 8) {
+ cifs_dbg(VFS, "reparse buffer is too small. Must be "
+ "at least 8 bytes but was %d\n", plen);
+ rc = -EIO;
+ goto querty_exit;
+ }
+
+ if (plen < le16_to_cpu(reparse_buf->ReparseDataLength) + 8) {
+ cifs_dbg(VFS, "srv returned invalid reparse buf "
+ "length: %d\n", plen);
rc = -EIO;
goto querty_exit;
}
- /* Do stuff with ioctl_buf/plen */
+ rc = parse_reparse_symlink(
+ (struct reparse_symlink_data_buffer *)reparse_buf,
+ plen, target_path, cifs_sb);
goto querty_exit;
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index c7d5813bebd8..858353d20c39 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -914,7 +914,19 @@ struct reparse_mount_point_data_buffer {
__u8 PathBuffer[0]; /* Variable Length */
} __packed;
-/* See MS-FSCC 2.1.2.4 and cifspdu.h for struct reparse_symlink_data */
+#define SYMLINK_FLAG_RELATIVE 0x00000001
+
+struct reparse_symlink_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __le32 Flags;
+ __u8 PathBuffer[0]; /* Variable Length */
+} __packed;
/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..d2c90bf1969a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -720,12 +720,11 @@ static void *dax_insert_entry(struct xa_state *xas,
xas_reset(xas);
xas_lock_irq(xas);
- if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ void *old;
+
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
- }
-
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -734,7 +733,7 @@ static void *dax_insert_entry(struct xa_state *xas,
* existing entry is a PMD, we will just leave the PMD in the
* tree and dirty it if necessary.
*/
- void *old = dax_lock_entry(xas, new_entry);
+ old = dax_lock_entry(xas, new_entry);
WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
DAX_LOCKED));
entry = new_entry;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c6f513100cc9..4c74c768ae43 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2325,7 +2325,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
error = do_epoll_wait(epfd, events, maxevents, timeout);
- restore_user_sigmask(sigmask, &sigsaved);
+ restore_user_sigmask(sigmask, &sigsaved, error == -EINTR);
return error;
}
@@ -2350,7 +2350,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
err = do_epoll_wait(epfd, events, maxevents, timeout);
- restore_user_sigmask(sigmask, &sigsaved);
+ restore_user_sigmask(sigmask, &sigsaved, err == -EINTR);
return err;
}
diff --git a/fs/inode.c b/fs/inode.c
index df6542ec3b88..2bf21e2c90fc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -362,7 +362,7 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
- xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ);
+ xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 86a2bd721900..4ef62a45045d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -579,6 +579,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
state->cur_req++;
}
+ req->file = NULL;
req->ctx = ctx;
req->flags = 0;
/* one is dropped after submission, the other at completion */
@@ -1801,10 +1802,8 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
req->sequence = ctx->cached_sq_head - 1;
}
- if (!io_op_needs_file(s->sqe)) {
- req->file = NULL;
+ if (!io_op_needs_file(s->sqe))
return 0;
- }
if (flags & IOSQE_FIXED_FILE) {
if (unlikely(!ctx->user_files ||
@@ -2201,11 +2200,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
if (sig)
- restore_user_sigmask(sig, &sigsaved);
+ restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
+
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 7660c2749c96..6fbc9126367a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2596,11 +2596,12 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
if (!check_mnt(p))
goto out;
- /* The thing moved should be either ours or completely unattached. */
- if (attached && !check_mnt(old))
+ /* The thing moved must be mounted... */
+ if (!is_mounted(&old->mnt))
goto out;
- if (!attached && !(ns && is_anon_ns(ns)))
+ /* ... and either ours or the root of anon namespace */
+ if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
goto out;
if (old->mnt.mnt_flags & MNT_LOCKED)
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index a809989807d6..19f856f45689 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -18,7 +18,7 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
+static unsigned int dataserver_timeo = NFS_DEF_TCP_TIMEO;
static unsigned int dataserver_retrans;
static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 2edbb657f859..55180501b915 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -462,7 +462,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
* a program is not able to use ptrace(2) in that case. It is
* safe because the task has stopped executing permanently.
*/
- if (permitted && (task->flags & PF_DUMPCORE)) {
+ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
if (try_get_task_stack(task)) {
eip = KSTK_EIP(task);
esp = KSTK_ESP(task);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9c8ca6cd3ce4..255f6754c70d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3077,8 +3077,7 @@ static const struct file_operations proc_tgid_base_operations = {
struct pid *tgid_pidfd_to_pid(const struct file *file)
{
- if (!d_is_dir(file->f_path.dentry) ||
- (file->f_op != &proc_tgid_base_operations))
+ if (file->f_op != &proc_tgid_base_operations)
return ERR_PTR(-EBADF);
return proc_pid(file_inode(file));
diff --git a/fs/select.c b/fs/select.c
index 6cbc9ff56ba0..a4d8f6e8b63c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -758,10 +758,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
return ret;
ret = core_sys_select(n, inp, outp, exp, to);
+ restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- restore_user_sigmask(sigmask, &sigsaved);
-
return ret;
}
@@ -1106,8 +1105,7 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
@@ -1142,8 +1140,7 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
@@ -1350,10 +1347,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return ret;
ret = compat_core_sys_select(n, inp, outp, exp, to);
+ restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- restore_user_sigmask(sigmask, &sigsaved);
-
return ret;
}
@@ -1425,8 +1421,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
@@ -1461,8 +1456,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ae0b8b5f69e6..ccbdbd62f0d8 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -40,6 +40,16 @@ enum userfaultfd_state {
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
* to be in the same cacheline.
+ *
+ * Locking order:
+ * fd_wqh.lock
+ * fault_pending_wqh.lock
+ * fault_wqh.lock
+ * event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
*/
struct userfaultfd_ctx {
/* waitqueue head for the pending (i.e. not read) userfaults */
@@ -458,7 +468,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
TASK_KILLABLE;
- spin_lock(&ctx->fault_pending_wqh.lock);
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
/*
* After the __add_wait_queue the uwq is visible to userland
* through poll/read().
@@ -470,7 +480,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* __add_wait_queue.
*/
set_current_state(blocking_state);
- spin_unlock(&ctx->fault_pending_wqh.lock);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vmf->vma))
must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
@@ -552,13 +562,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* kernel stack can be released after the list_del_init.
*/
if (!list_empty_careful(&uwq.wq.entry)) {
- spin_lock(&ctx->fault_pending_wqh.lock);
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
/*
* No need of list_del_init(), the uwq on the stack
* will be freed shortly anyway.
*/
list_del(&uwq.wq.entry);
- spin_unlock(&ctx->fault_pending_wqh.lock);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}
/*
@@ -583,7 +593,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
init_waitqueue_entry(&ewq->wq, current);
release_new_ctx = NULL;
- spin_lock(&ctx->event_wqh.lock);
+ spin_lock_irq(&ctx->event_wqh.lock);
/*
* After the __add_wait_queue the uwq is visible to userland
* through poll/read().
@@ -613,15 +623,15 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
break;
}
- spin_unlock(&ctx->event_wqh.lock);
+ spin_unlock_irq(&ctx->event_wqh.lock);
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
schedule();
- spin_lock(&ctx->event_wqh.lock);
+ spin_lock_irq(&ctx->event_wqh.lock);
}
__set_current_state(TASK_RUNNING);
- spin_unlock(&ctx->event_wqh.lock);
+ spin_unlock_irq(&ctx->event_wqh.lock);
if (release_new_ctx) {
struct vm_area_struct *vma;
@@ -918,10 +928,10 @@ wakeup:
* the last page faults that may have been already waiting on
* the fault_*wqh.
*/
- spin_lock(&ctx->fault_pending_wqh.lock);
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
- spin_unlock(&ctx->fault_pending_wqh.lock);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
/* Flush pending events that may still wait on event_wqh */
wake_up_all(&ctx->event_wqh);
@@ -1134,7 +1144,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (!ret && msg->event == UFFD_EVENT_FORK) {
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
- spin_lock(&ctx->event_wqh.lock);
+ spin_lock_irq(&ctx->event_wqh.lock);
if (!list_empty(&fork_event)) {
/*
* The fork thread didn't abort, so we can
@@ -1180,7 +1190,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (ret)
userfaultfd_ctx_put(fork_nctx);
}
- spin_unlock(&ctx->event_wqh.lock);
+ spin_unlock_irq(&ctx->event_wqh.lock);
}
return ret;
@@ -1219,14 +1229,14 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
static void __wake_userfault(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range *range)
{
- spin_lock(&ctx->fault_pending_wqh.lock);
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
/* wake all in the range and autoremove */
if (waitqueue_active(&ctx->fault_pending_wqh))
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
range);
if (waitqueue_active(&ctx->fault_wqh))
__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
- spin_unlock(&ctx->fault_pending_wqh.lock);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
@@ -1881,7 +1891,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
wait_queue_entry_t *wq;
unsigned long pending = 0, total = 0;
- spin_lock(&ctx->fault_pending_wqh.lock);
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
pending++;
total++;
@@ -1889,7 +1899,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
total++;
}
- spin_unlock(&ctx->fault_pending_wqh.lock);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
/*
* If more protocols will be added, there will be all shown