diff options
Diffstat (limited to 'fs/notify/fanotify/fanotify_user.c')
| -rw-r--r-- | fs/notify/fanotify/fanotify_user.c | 1762 |
1 files changed, 1380 insertions, 382 deletions
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 63b5dffdca9e..d0b9b984002f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -22,21 +22,179 @@ #include <asm/ioctls.h> -#include "../../mount.h" +#include "../fsnotify.h" #include "../fdinfo.h" #include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 -#define FANOTIFY_DEFAULT_MAX_MARKS 8192 -#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 +#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 +#define FANOTIFY_DEFAULT_MAX_GROUPS 128 +#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32 + +/* + * Legacy fanotify marks limits (8192) is per group and we introduced a tunable + * limit of marks per user, similar to inotify. Effectively, the legacy limit + * of fanotify marks per user is <max marks per group> * <max groups per user>. + * This default limit (1M) also happens to match the increased limit of inotify + * max_user_watches since v5.10. + */ +#define FANOTIFY_DEFAULT_MAX_USER_MARKS \ + (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS) + +/* + * Most of the memory cost of adding an inode mark is pinning the marked inode. + * The size of the filesystem inode struct is not uniform across filesystems, + * so double the size of a VFS inode is used as a conservative approximation. + */ +#define INODE_MARK_COST (2 * sizeof(struct inode)) + +/* configurable via /proc/sys/fs/fanotify/ */ +static int fanotify_max_queued_events __read_mostly; +static int perm_group_timeout __read_mostly; + +#ifdef CONFIG_SYSCTL + +#include <linux/sysctl.h> + +static long ft_zero = 0; +static long ft_int_max = INT_MAX; + +static const struct ctl_table fanotify_table[] = { + { + .procname = "max_user_groups", + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &ft_zero, + .extra2 = &ft_int_max, + }, + { + .procname = "max_user_marks", + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &ft_zero, + .extra2 = &ft_int_max, + }, + { + .procname = "max_queued_events", + .data = &fanotify_max_queued_events, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO + }, + { + .procname = "watchdog_timeout", + .data = &perm_group_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +}; + +static void __init fanotify_sysctls_init(void) +{ + register_sysctl("fs/fanotify", fanotify_table); +} +#else +#define fanotify_sysctls_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +static LIST_HEAD(perm_group_list); +static DEFINE_SPINLOCK(perm_group_lock); +static void perm_group_watchdog(struct work_struct *work); +static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog); + +static void perm_group_watchdog_schedule(void) +{ + schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout)); +} + +static void perm_group_watchdog(struct work_struct *work) +{ + struct fsnotify_group *group; + struct fanotify_perm_event *event; + struct task_struct *task; + pid_t failed_pid = 0; + + guard(spinlock)(&perm_group_lock); + if (list_empty(&perm_group_list)) + return; + + list_for_each_entry(group, &perm_group_list, + fanotify_data.perm_grp_list) { + /* + * Ok to test without lock, racing with an addition is + * fine, will deal with it next round + */ + if (list_empty(&group->fanotify_data.access_list)) + continue; + + spin_lock(&group->notification_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (likely(event->watchdog_cnt == 0)) { + event->watchdog_cnt = 1; + } else if (event->watchdog_cnt == 1) { + /* Report on event only once */ + event->watchdog_cnt = 2; + + /* Do not report same pid repeatedly */ + if (event->recv_pid == failed_pid) + continue; + + failed_pid = event->recv_pid; + rcu_read_lock(); + task = find_task_by_pid_ns(event->recv_pid, + &init_pid_ns); + pr_warn_ratelimited( + "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n", + event->recv_pid, + task ? task->comm : NULL, + perm_group_timeout); + rcu_read_unlock(); + } + } + spin_unlock(&group->notification_lock); + } + perm_group_watchdog_schedule(); +} + +static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group) +{ + if (!list_empty(&group->fanotify_data.perm_grp_list)) { + /* Perm event watchdog can no longer scan this group. */ + spin_lock(&perm_group_lock); + list_del_init(&group->fanotify_data.perm_grp_list); + spin_unlock(&perm_group_lock); + } +} + +static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group) +{ + if (!perm_group_timeout) + return; + + spin_lock(&perm_group_lock); + if (list_empty(&group->fanotify_data.perm_grp_list)) { + /* Add to perm_group_list for monitoring by watchdog. */ + if (list_empty(&perm_group_list)) + perm_group_watchdog_schedule(); + list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list); + } + spin_unlock(&perm_group_lock); +} /* * All flags that may be specified in parameter event_f_flags of fanotify_init. * * Internal and external open flags are stored together in field f_flags of * struct file. Only external open flags shall be allowed in event_f_flags. - * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be - * excluded. + * Internal flags like FMODE_EXEC shall be excluded. */ #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ O_ACCMODE | O_APPEND | O_NONBLOCK | \ @@ -45,14 +203,23 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; -struct kmem_cache *fanotify_mark_cache __read_mostly; -struct kmem_cache *fanotify_fid_event_cachep __read_mostly; -struct kmem_cache *fanotify_path_event_cachep __read_mostly; -struct kmem_cache *fanotify_perm_event_cachep __read_mostly; +struct kmem_cache *fanotify_mark_cache __ro_after_init; +struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; +struct kmem_cache *fanotify_path_event_cachep __ro_after_init; +struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; +struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 -#define FANOTIFY_INFO_HDR_LEN \ +#define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) +#define FANOTIFY_PIDFD_INFO_LEN \ + sizeof(struct fanotify_event_info_pidfd) +#define FANOTIFY_ERROR_INFO_LEN \ + (sizeof(struct fanotify_event_info_error)) +#define FANOTIFY_RANGE_INFO_LEN \ + (sizeof(struct fanotify_event_info_range)) +#define FANOTIFY_MNT_INFO_LEN \ + (sizeof(struct fanotify_event_info_mnt)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -61,25 +228,80 @@ static int fanotify_fid_info_len(int fh_len, int name_len) if (name_len) info_len += name_len + 1; - return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); + return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, + FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(struct fanotify_event *event) +/* FAN_RENAME may have one or two dir+name info records */ +static int fanotify_dir_name_info_len(struct fanotify_event *event) { + struct fanotify_info *info = fanotify_event_info(event); + int dir_fh_len = fanotify_event_dir_fh_len(event); + int dir2_fh_len = fanotify_event_dir2_fh_len(event); int info_len = 0; - int fh_len = fanotify_event_object_fh_len(event); - if (fh_len) - info_len += fanotify_fid_info_len(fh_len, 0); + if (dir_fh_len) + info_len += fanotify_fid_info_len(dir_fh_len, + info->name_len); + if (dir2_fh_len) + info_len += fanotify_fid_info_len(dir2_fh_len, + info->name2_len); - if (fanotify_event_name_len(event)) { - struct fanotify_name_event *fne = FANOTIFY_NE(event); + return info_len; +} + +static size_t fanotify_event_len(unsigned int info_mode, + struct fanotify_event *event) +{ + size_t event_len = FAN_EVENT_METADATA_LEN; + int fh_len; + int dot_len = 0; - info_len += fanotify_fid_info_len(fne->dir_fh.len, - fne->name_len); + if (fanotify_is_error_event(event->mask)) + event_len += FANOTIFY_ERROR_INFO_LEN; + + if (fanotify_event_has_any_dir_fh(event)) { + event_len += fanotify_dir_name_info_len(event); + } else if ((info_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not recorded in + * event on a directory, we will report the name ".". + */ + dot_len = 1; } - return info_len; + if (fanotify_event_has_object_fh(event)) { + fh_len = fanotify_event_object_fh_len(event); + event_len += fanotify_fid_info_len(fh_len, dot_len); + } + if (fanotify_is_mnt_event(event->mask)) + event_len += FANOTIFY_MNT_INFO_LEN; + + if (info_mode & FAN_REPORT_PIDFD) + event_len += FANOTIFY_PIDFD_INFO_LEN; + + if (fanotify_event_has_access_range(event)) + event_len += FANOTIFY_RANGE_INFO_LEN; + + return event_len; +} + +/* + * Remove an hashed event from merge hash table. + */ +static void fanotify_unhash_event(struct fsnotify_group *group, + struct fanotify_event *event) +{ + assert_spin_locked(&group->notification_lock); + + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, + group, event, fanotify_event_hash_bucket(group, event)); + + if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list))) + return; + + hlist_del_init(&event->merge_list); } /* @@ -91,33 +313,41 @@ static int fanotify_event_info_len(struct fanotify_event *event) static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - size_t event_size = FAN_EVENT_METADATA_LEN; + size_t event_size; struct fanotify_event *event = NULL; + struct fsnotify_event *fsn_event; + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); spin_lock(&group->notification_lock); - if (fsnotify_notify_queue_is_empty(group)) + fsn_event = fsnotify_peek_first_event(group); + if (!fsn_event) goto out; - if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { - event_size += fanotify_event_info_len( - FANOTIFY_E(fsnotify_peek_first_event(group))); - } + event = FANOTIFY_E(fsn_event); + event_size = fanotify_event_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); goto out; } - event = FANOTIFY_E(fsnotify_remove_first_event(group)); + + /* + * Held the notification_lock the whole time, so this is the + * same event we peeked above. + */ + fsnotify_remove_first_event(group); if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; + if (fanotify_is_hashed_event(event->mask)) + fanotify_unhash_event(group, event); out: spin_unlock(&group->notification_lock); return event; } -static int create_fd(struct fsnotify_group *group, struct path *path, +static int create_fd(struct fsnotify_group *group, const struct path *path, struct file **file) { int client_fd; @@ -128,20 +358,12 @@ static int create_fd(struct fsnotify_group *group, struct path *path, return client_fd; /* - * we need a new file handle for the userspace program so it can read even if it was - * originally opened O_WRONLY. + * We provide an fd for the userspace program, so it could access the + * file without generating fanotify events itself. */ - new_file = dentry_open(path, - group->fanotify_data.f_flags | FMODE_NONOTIFY, - current_cred()); + new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags, + current_cred()); if (IS_ERR(new_file)) { - /* - * we still send an event even if we can't open the file. this - * can happen when say tasks are gone and we try to open their - * /proc files or we try to open a WRONLY file like in sysfs - * we just send the errno to userspace since there isn't much - * else we can do. - */ put_unused_fd(client_fd); client_fd = PTR_ERR(new_file); } else { @@ -151,19 +373,42 @@ static int create_fd(struct fsnotify_group *group, struct path *path, return client_fd; } +static int process_access_response_info(const char __user *info, + size_t info_len, + struct fanotify_response_info_audit_rule *friar) +{ + if (info_len != sizeof(*friar)) + return -EINVAL; + + if (copy_from_user(friar, info, sizeof(*friar))) + return -EFAULT; + + if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE) + return -EINVAL; + if (friar->hdr.pad != 0) + return -EINVAL; + if (friar->hdr.len != sizeof(*friar)) + return -EINVAL; + + return info_len; +} + /* * Finish processing of permission event by setting it to ANSWERED state and * drop group->notification_lock. */ static void finish_permission_event(struct fsnotify_group *group, - struct fanotify_perm_event *event, - unsigned int response) + struct fanotify_perm_event *event, u32 response, + struct fanotify_response_info_audit_rule *friar) __releases(&group->notification_lock) { bool destroy = false; assert_spin_locked(&group->notification_lock); - event->response = response; + event->response = response & ~FAN_INFO; + if (response & FAN_INFO) + memcpy(&event->audit_rule, friar, sizeof(*friar)); + if (event->state == FAN_EVENT_CANCELED) destroy = true; else @@ -174,31 +419,73 @@ static void finish_permission_event(struct fsnotify_group *group, } static int process_access_response(struct fsnotify_group *group, - struct fanotify_response *response_struct) + struct fanotify_response *response_struct, + const char __user *info, + size_t info_len) { struct fanotify_perm_event *event; int fd = response_struct->fd; - int response = response_struct->response; + u32 response = response_struct->response; + int errno = fanotify_get_response_errno(response); + int ret = info_len; + struct fanotify_response_info_audit_rule friar; - pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, - fd, response); + pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n", + __func__, group, fd, response, errno, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the * timeout */ - switch (response & ~FAN_AUDIT) { + if (response & ~FANOTIFY_RESPONSE_VALID_MASK) + return -EINVAL; + + switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: + if (errno) + return -EINVAL; + break; case FAN_DENY: + /* Custom errno is supported only for pre-content groups */ + if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT) + return -EINVAL; + + /* + * Limit errno to values expected on open(2)/read(2)/write(2) + * of regular files. + */ + switch (errno) { + case 0: + case EIO: + case EPERM: + case EBUSY: + case ETXTBSY: + case EAGAIN: + case ENOSPC: + case EDQUOT: + break; + default: + return -EINVAL; + } break; default: return -EINVAL; } - if (fd < 0) + if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; - if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) + if (response & FAN_INFO) { + ret = process_access_response_info(info, info_len, &friar); + if (ret < 0) + return ret; + if (fd == FAN_NOFD) + return ret; + } else { + ret = 0; + } + + if (fd < 0) return -EINVAL; spin_lock(&group->notification_lock); @@ -208,18 +495,59 @@ static int process_access_response(struct fsnotify_group *group, continue; list_del_init(&event->fae.fse.list); - finish_permission_event(group, event, response); + finish_permission_event(group, event, response, &friar); wake_up(&group->fanotify_data.access_waitq); - return 0; + return ret; } spin_unlock(&group->notification_lock); return -ENOENT; } -static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, - const char *name, size_t name_len, - char __user *buf, size_t count) +static size_t copy_mnt_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_event_info_mnt info = { }; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT; + info.hdr.len = FANOTIFY_MNT_INFO_LEN; + + if (WARN_ON(count < info.hdr.len)) + return -EFAULT; + + info.mnt_id = FANOTIFY_ME(event)->mnt_id; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + +static size_t copy_error_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_event_info_error info = { }; + struct fanotify_error_event *fee = FANOTIFY_EE(event); + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; + info.hdr.len = FANOTIFY_ERROR_INFO_LEN; + + if (WARN_ON(count < info.hdr.len)) + return -EFAULT; + + info.error = fee->error; + info.error_count = fee->err_count; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + +static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, + int info_type, const char *name, + size_t name_len, + char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; @@ -231,9 +559,6 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", __func__, fh_len, name_len, info_len, count); - if (!fh_len || (name && !name_len)) - return 0; - if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT; @@ -241,8 +566,23 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, * Copy event info fid header followed by variable sized file handle * and optionally followed by variable sized filename. */ - info.hdr.info_type = name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : - FAN_EVENT_INFO_TYPE_FID; + switch (info_type) { + case FAN_EVENT_INFO_TYPE_FID: + case FAN_EVENT_INFO_TYPE_DFID: + if (WARN_ON_ONCE(name_len)) + return -EFAULT; + break; + case FAN_EVENT_INFO_TYPE_DFID_NAME: + case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME: + case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME: + if (WARN_ON_ONCE(!name || !name_len)) + return -EFAULT; + break; + default: + return -EFAULT; + } + + info.hdr.info_type = info_type; info.hdr.len = len; info.fsid = *fsid; if (copy_to_user(buf, &info, sizeof(info))) @@ -255,6 +595,11 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, handle.handle_type = fh->type; handle.handle_bytes = fh_len; + + /* Mangle handle_type for bad file_handle */ + if (!fh_len) + handle.handle_type = FILEID_INVALID; + if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT; @@ -299,31 +644,288 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, return info_len; } +static int copy_pidfd_info_to_user(int pidfd, + char __user *buf, + size_t count) +{ + struct fanotify_event_info_pidfd info = { }; + size_t info_len = FANOTIFY_PIDFD_INFO_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; + info.hdr.len = info_len; + info.pidfd = pidfd; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + +static size_t copy_range_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_perm_event *pevent = FANOTIFY_PERM(event); + struct fanotify_event_info_range info = { }; + size_t info_len = FANOTIFY_RANGE_INFO_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + if (WARN_ON_ONCE(!pevent->ppos)) + return -EINVAL; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE; + info.hdr.len = info_len; + info.offset = *(pevent->ppos); + info.count = pevent->count; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + +static int copy_info_records_to_user(struct fanotify_event *event, + struct fanotify_info *info, + unsigned int info_mode, int pidfd, + char __user *buf, size_t count) +{ + int ret, total_bytes = 0, info_type = 0; + unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; + + /* + * Event info records order is as follows: + * 1. dir fid + name + * 2. (optional) new dir fid + new name + * 3. (optional) child fid + */ + if (fanotify_event_has_dir_fh(event)) { + info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : + FAN_EVENT_INFO_TYPE_DFID; + + /* FAN_RENAME uses special info types */ + if (event->mask & FAN_RENAME) + info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME; + + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_info_dir_fh(info), + info_type, + fanotify_info_name(info), + info->name_len, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + /* New dir fid+name may be reported in addition to old dir fid+name */ + if (fanotify_event_has_dir2_fh(event)) { + info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_info_dir2_fh(info), + info_type, + fanotify_info_name2(info), + info->name2_len, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (fanotify_event_has_object_fh(event)) { + const char *dot = NULL; + int dot_len = 0; + + if (fid_mode == FAN_REPORT_FID || info_type) { + /* + * With only group flag FAN_REPORT_FID only type FID is + * reported. Second info record type is always FID. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((fid_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not + * recorded in an event on a directory, report the name + * "." with info type DFID_NAME. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; + dot = "."; + dot_len = 1; + } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_DIR_FID, a single info + * record has type DFID for directory entry modification + * event and for event on a directory. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID; + } else { + /* + * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, + * a single info record has type FID for event on a + * non-directory, when there is no directory to report. + * For example, on FAN_DELETE_SELF event. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } + + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_event_object_fh(event), + info_type, dot, dot_len, + buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (pidfd_mode) { + ret = copy_pidfd_info_to_user(pidfd, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (fanotify_is_error_event(event->mask)) { + ret = copy_error_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (fanotify_event_has_access_range(event)) { + ret = copy_range_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (fanotify_is_mnt_event(event->mask)) { + ret = copy_mnt_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + + return total_bytes; +} + static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event *event, char __user *buf, size_t count) { struct fanotify_event_metadata metadata; - struct path *path = fanotify_event_path(event); - struct file *f = NULL; - int ret, fd = FAN_NOFD; + const struct path *path = fanotify_event_path(event); + struct fanotify_info *info = fanotify_event_info(event); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; + struct file *f = NULL, *pidfd_file = NULL; + int ret, pidfd = -ESRCH, fd = -EBADF; pr_debug("%s: group=%p event=%p\n", __func__, group, event); - metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(event); + metadata.event_len = fanotify_event_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); + /* + * For an unprivileged listener, event->pid can be used to identify the + * events generated by the listener process itself, without disclosing + * the pids of other processes. + */ + if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && + task_tgid(current) != event->pid) + metadata.pid = 0; - if (path && path->mnt && path->dentry) { + /* + * For now, fid mode is required for an unprivileged listener and + * fid mode does not report fd in events. Keep this check anyway + * for safety in case fid mode requirement is relaxed in the future + * to allow unprivileged listener to get events with no fd and no fid. + */ + if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && + path && path->mnt && path->dentry) { fd = create_fd(group, path, &f); - if (fd < 0) - return fd; + /* + * Opening an fd from dentry can fail for several reasons. + * For example, when tasks are gone and we try to open their + * /proc files or we try to open a WRONLY file like in sysfs + * or when trying to open a file that was deleted on the + * remote network server. + * + * For a group with FAN_REPORT_FD_ERROR, we will send the + * event with the error instead of the open fd, otherwise + * Userspace may not get the error at all. + * In any case, userspace will not know which file failed to + * open, so add a debug print for further investigation. + */ + if (fd < 0) { + pr_debug("fanotify: create_fd(%pd2) failed err=%d\n", + path->dentry, fd); + if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) { + /* + * Historically, we've handled EOPENSTALE in a + * special way and silently dropped such + * events. Now we have to keep it to maintain + * backward compatibility... + */ + if (fd == -EOPENSTALE) + fd = 0; + return fd; + } + } + } + if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) + metadata.fd = fd; + else + metadata.fd = fd >= 0 ? fd : FAN_NOFD; + + if (pidfd_mode) { + /* + * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual + * exclusion is ever lifted. At the time of incoporating pidfd + * support within fanotify, the pidfd API only supported the + * creation of pidfds for thread-group leaders. + */ + WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); + + /* + * The PIDTYPE_TGID check for an event->pid is performed + * preemptively in an attempt to catch out cases where the event + * listener reads events after the event generating process has + * already terminated. Depending on flag FAN_REPORT_FD_ERROR, + * report either -ESRCH or FAN_NOPIDFD to the event listener in + * those cases with all other pidfd creation errors reported as + * the error code itself or as FAN_EPIDFD. + */ + if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID)) + pidfd = pidfd_prepare(event->pid, 0, &pidfd_file); + + if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0) + pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD; } - metadata.fd = fd; ret = -EFAULT; /* @@ -339,45 +941,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, buf += FAN_EVENT_METADATA_LEN; count -= FAN_EVENT_METADATA_LEN; - if (fanotify_is_perm_event(event->mask)) - FANOTIFY_PERM(event)->fd = fd; + ret = copy_info_records_to_user(event, info, info_mode, pidfd, + buf, count); + if (ret < 0) + goto out_close_fd; if (f) fd_install(fd, f); - /* Event info records order is: dir fid + name, child fid */ - if (fanotify_event_name_len(event)) { - struct fanotify_name_event *fne = FANOTIFY_NE(event); - - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_event_dir_fh(event), - fne->name, fne->name_len, - buf, count); - if (ret < 0) - return ret; + if (pidfd_file) + fd_install(pidfd, pidfd_file); - buf += ret; - count -= ret; - } - - if (fanotify_event_object_fh_len(event)) { - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_event_object_fh(event), - NULL, 0, buf, count); - if (ret < 0) - return ret; - - buf += ret; - count -= ret; - } + if (fanotify_is_perm_event(event->mask)) + FANOTIFY_PERM(event)->fd = fd; return metadata.event_len; out_close_fd: - if (fd != FAN_NOFD) { + if (f) { put_unused_fd(fd); fput(f); } + + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + return ret; } @@ -412,6 +1002,11 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { + /* + * User can supply arbitrarily large buffer. Avoid softlockups + * in case there are lots of available events. + */ + cond_resched(); event = get_one_event(group, count); if (IS_ERR(event)) { ret = PTR_ERR(event); @@ -435,15 +1030,6 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, } ret = copy_event_to_user(group, event, buf, count); - if (unlikely(ret == -EOPENSTALE)) { - /* - * We cannot report events with stale fd so drop it. - * Setting ret to 0 will continue the event loop and - * do the right thing if there are no more events to - * read (i.e. return bytes read, -EAGAIN or wait). - */ - ret = 0; - } /* * Permission events get queued to wait for response. Other @@ -452,15 +1038,16 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (!fanotify_is_perm_event(event->mask)) { fsnotify_destroy_event(group, &event->fse); } else { - if (ret <= 0) { + if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) { spin_lock(&group->notification_lock); finish_permission_event(group, - FANOTIFY_PERM(event), FAN_DENY); + FANOTIFY_PERM(event), FAN_DENY, NULL); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); list_add_tail(&event->fse.list, &group->fanotify_data.access_list); + FANOTIFY_PERM(event)->recv_pid = current->pid; spin_unlock(&group->notification_lock); } } @@ -478,28 +1065,32 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct fanotify_response response = { .fd = -1, .response = -1 }; + struct fanotify_response response; struct fsnotify_group *group; int ret; + const char __user *info_buf = buf + sizeof(struct fanotify_response); + size_t info_len; if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) return -EINVAL; group = file->private_data; + pr_debug("%s: group=%p count=%zu\n", __func__, group, count); + if (count < sizeof(response)) return -EINVAL; - count = sizeof(response); - - pr_debug("%s: group=%p count=%zu\n", __func__, group, count); - - if (copy_from_user(&response, buf, count)) + if (copy_from_user(&response, buf, sizeof(response))) return -EFAULT; - ret = process_access_response(group, &response); + info_len = count - sizeof(response); + + ret = process_access_response(group, &response, info_buf, info_len); if (ret < 0) count = ret; + else + count = sizeof(response) + ret; return count; } @@ -507,6 +1098,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; + struct fsnotify_event *fsn_event; /* * Stop new events from arriving in the notification queue. since @@ -515,6 +1107,8 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ fsnotify_group_stop_queueing(group); + fanotify_perm_watchdog_group_remove(group); + /* * Process all permission events on access_list and notification queue * and simulate reply from userspace. @@ -526,7 +1120,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) event = list_first_entry(&group->fanotify_data.access_list, struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); - finish_permission_event(group, event, FAN_ALLOW); + finish_permission_event(group, event, FAN_ALLOW, NULL); spin_lock(&group->notification_lock); } @@ -535,16 +1129,15 @@ static int fanotify_release(struct inode *ignored, struct file *file) * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns. */ - while (!fsnotify_notify_queue_is_empty(group)) { - struct fanotify_event *event; + while ((fsn_event = fsnotify_remove_first_event(group))) { + struct fanotify_event *event = FANOTIFY_E(fsn_event); - event = FANOTIFY_E(fsnotify_remove_first_event(group)); if (!(event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); - fsnotify_destroy_event(group, &event->fse); + fsnotify_destroy_event(group, fsn_event); } else { finish_permission_event(group, FANOTIFY_PERM(event), - FAN_ALLOW); + FAN_ALLOW, NULL); } spin_lock(&group->notification_lock); } @@ -606,22 +1199,17 @@ static int fanotify_find_path(int dfd, const char __user *filename, dfd, filename, flags); if (filename == NULL) { - struct fd f = fdget(dfd); + CLASS(fd, f)(dfd); - ret = -EBADF; - if (!f.file) - goto out; + if (fd_empty(f)) + return -EBADF; - ret = -ENOTDIR; if ((flags & FAN_MARK_ONLYDIR) && - !(S_ISDIR(file_inode(f.file)->i_mode))) { - fdput(f); - goto out; - } + !(S_ISDIR(file_inode(fd_file(f))->i_mode))) + return -ENOTDIR; - *path = f.file->f_path; + *path = fd_file(f)->f_path; path_get(path); - fdput(f); } else { unsigned int lookup_flags = 0; @@ -636,7 +1224,7 @@ static int fanotify_find_path(int dfd, const char __user *filename, } /* you can only watch an inode if you have read permissions on it */ - ret = inode_permission(path->dentry->d_inode, MAY_READ); + ret = path_permission(path, MAY_READ); if (ret) { path_put(path); goto out; @@ -651,47 +1239,55 @@ out: } static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, - __u32 mask, - unsigned int flags, - int *destroy) + __u32 mask, unsigned int flags, + __u32 umask, int *destroy) { - __u32 oldmask = 0; + __u32 oldmask, newmask; + /* umask bits cannot be removed by user */ + mask &= ~umask; spin_lock(&fsn_mark->lock); - if (!(flags & FAN_MARK_IGNORED_MASK)) { - oldmask = fsn_mark->mask; + oldmask = fsnotify_calc_mask(fsn_mark); + if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { - fsn_mark->ignored_mask &= ~mask; + fsn_mark->ignore_mask &= ~mask; } - *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); + newmask = fsnotify_calc_mask(fsn_mark); + /* + * We need to keep the mark around even if remaining mask cannot + * result in any events (e.g. mask == FAN_ONDIR) to support incremenal + * changes to the mask. + * Destroy mark when only umask bits remain. + */ + *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); spin_unlock(&fsn_mark->lock); - return mask & oldmask; + return oldmask & ~newmask; } static int fanotify_remove_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, __u32 mask, - unsigned int flags) + void *obj, unsigned int obj_type, __u32 mask, + unsigned int flags, __u32 umask) { struct fsnotify_mark *fsn_mark = NULL; __u32 removed; int destroy_mark; - mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_mark(connp, group); + fsnotify_group_lock(group); + fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); return -ENOENT; } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, - &destroy_mark); + umask, &destroy_mark); if (removed & fsnotify_conn_mask(fsn_mark->connector)) fsnotify_recalc_mask(fsn_mark->connector); if (destroy_mark) fsnotify_detach_mark(fsn_mark); - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); if (destroy_mark) fsnotify_free_mark(fsn_mark); @@ -700,150 +1296,343 @@ static int fanotify_remove_mark(struct fsnotify_group *group, return 0; } -static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt, __u32 mask, - unsigned int flags) +static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, + unsigned int fan_flags) { - return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - mask, flags); -} + bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); + unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; + bool recalc = false; -static int fanotify_remove_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags) -{ - return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags); -} + /* + * When using FAN_MARK_IGNORE for the first time, mark starts using + * independent event flags in ignore mask. After that, trying to + * update the ignore mask with the old FAN_MARK_IGNORED_MASK API + * will result in EEXIST error. + */ + if (ignore == FAN_MARK_IGNORE) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; -static int fanotify_remove_inode_mark(struct fsnotify_group *group, - struct inode *inode, __u32 mask, - unsigned int flags) -{ - return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask, - flags); + /* + * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to + * the removal of the FS_MODIFY bit in calculated mask if it was set + * because of an ignore mask that is now going to survive FS_MODIFY. + */ + if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; + if (!(fsn_mark->mask & FS_MODIFY)) + recalc = true; + } + + if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE || + want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return recalc; + + /* + * NO_IREF may be removed from a mark, but not added. + * When removed, fsnotify_recalc_mask() will take the inode ref. + */ + WARN_ON_ONCE(!want_iref); + fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF; + + return true; } -static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, - __u32 mask, - unsigned int flags) +static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, unsigned int fan_flags) { - __u32 oldmask = -1; + bool recalc; spin_lock(&fsn_mark->lock); - if (!(flags & FAN_MARK_IGNORED_MASK)) { - oldmask = fsn_mark->mask; + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; - } else { - fsn_mark->ignored_mask |= mask; - if (flags & FAN_MARK_IGNORED_SURV_MODIFY) - fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; - } + else + fsn_mark->ignore_mask |= mask; + + recalc = fsnotify_calc_mask(fsn_mark) & + ~fsnotify_conn_mask(fsn_mark->connector); + + recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags); spin_unlock(&fsn_mark->lock); - return mask & ~oldmask; + return recalc; +} + +struct fan_fsid { + struct super_block *sb; + __kernel_fsid_t id; + bool weak; +}; + +static int fanotify_set_mark_fsid(struct fsnotify_group *group, + struct fsnotify_mark *mark, + struct fan_fsid *fsid) +{ + struct fsnotify_mark_connector *conn; + struct fsnotify_mark *old; + struct super_block *old_sb = NULL; + + FANOTIFY_MARK(mark)->fsid = fsid->id; + mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + if (fsid->weak) + mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID; + + /* First mark added will determine if group is single or multi fsid */ + if (list_empty(&group->marks_list)) + return 0; + + /* Find sb of an existing mark */ + list_for_each_entry(old, &group->marks_list, g_list) { + conn = READ_ONCE(old->connector); + if (!conn) + continue; + old_sb = fsnotify_connector_sb(conn); + if (old_sb) + break; + } + + /* Only detached marks left? */ + if (!old_sb) + return 0; + + /* Do not allow mixing of marks with weak and strong fsid */ + if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID) + return -EXDEV; + + /* Allow mixing of marks with strong fsid from different fs */ + if (!fsid->weak) + return 0; + + /* Do not allow mixing marks with weak fsid from different fs */ + if (old_sb != fsid->sb) + return -EXDEV; + + /* Do not allow mixing marks from different btrfs sub-volumes */ + if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid, + &FANOTIFY_MARK(mark)->fsid)) + return -EXDEV; + + return 0; } static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, - unsigned int type, - __kernel_fsid_t *fsid) + void *obj, + unsigned int obj_type, + unsigned int fan_flags, + struct fan_fsid *fsid) { + struct ucounts *ucounts = group->fanotify_data.ucounts; + struct fanotify_mark *fan_mark; struct fsnotify_mark *mark; int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + /* + * Enforce per user marks limits per user in all containing user ns. + * A group with FAN_UNLIMITED_MARKS does not contribute to mark count + * in the limited groups account. + */ + BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS)); + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && + !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); - mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!mark) - return ERR_PTR(-ENOMEM); + fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fan_mark) { + ret = -ENOMEM; + goto out_dec_ucounts; + } + mark = &fan_mark->fsn_mark; fsnotify_init_mark(mark, group); - ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); - if (ret) { - fsnotify_put_mark(mark); - return ERR_PTR(ret); + if (fan_flags & FAN_MARK_EVICTABLE) + mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; + + /* Cache fsid of filesystem containing the marked object */ + if (fsid) { + ret = fanotify_set_mark_fsid(group, mark, fsid); + if (ret) + goto out_put_mark; + } else { + fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; } + ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0); + if (ret) + goto out_put_mark; + return mark; + +out_put_mark: + fsnotify_put_mark(mark); +out_dec_ucounts: + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) + dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); + return ERR_PTR(ret); } +static int fanotify_group_init_error_pool(struct fsnotify_group *group) +{ + if (mempool_initialized(&group->fanotify_data.error_events_pool)) + return 0; + + return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool, + FANOTIFY_DEFAULT_FEE_POOL_SIZE, + sizeof(struct fanotify_error_event)); +} + +static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, + __u32 mask, unsigned int fan_flags) +{ + /* + * Non evictable mark cannot be downgraded to evictable mark. + */ + if (fan_flags & FAN_MARK_EVICTABLE && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return -EEXIST; + + /* + * New ignore mask semantics cannot be downgraded to old semantics. + */ + if (fan_flags & FAN_MARK_IGNORED_MASK && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + return -EEXIST; + + /* + * An ignore mask that survives modify could never be downgraded to not + * survive modify. With new FAN_MARK_IGNORE semantics we make that rule + * explicit and return an error when trying to update the ignore mask + * without the original FAN_MARK_IGNORED_SURV_MODIFY value. + */ + if (fan_flags & FAN_MARK_IGNORE && + !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + return -EEXIST; + + /* For now pre-content events are not generated for directories */ + mask |= fsn_mark->mask; + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) + return -EEXIST; + + return 0; +} static int fanotify_add_mark(struct fsnotify_group *group, - fsnotify_connp_t *connp, unsigned int type, - __u32 mask, unsigned int flags, - __kernel_fsid_t *fsid) + void *obj, unsigned int obj_type, + __u32 mask, unsigned int fan_flags, + struct fan_fsid *fsid) { struct fsnotify_mark *fsn_mark; - __u32 added; + bool recalc; + int ret = 0; - mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_mark(connp, group); + fsnotify_group_lock(group); + fsn_mark = fsnotify_find_mark(obj, obj_type, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, type, fsid); + fsn_mark = fanotify_add_new_mark(group, obj, obj_type, + fan_flags, fsid); if (IS_ERR(fsn_mark)) { - mutex_unlock(&group->mark_mutex); + fsnotify_group_unlock(group); return PTR_ERR(fsn_mark); } } - added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - if (added & ~fsnotify_conn_mask(fsn_mark->connector)) + + /* + * Check if requested mark flags conflict with an existing mark flags. + */ + ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags); + if (ret) + goto out; + + /* + * Error events are pre-allocated per group, only if strictly + * needed (i.e. FAN_FS_ERROR was requested). + */ + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && + (mask & FAN_FS_ERROR)) { + ret = fanotify_group_init_error_pool(group); + if (ret) + goto out; + } + + recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags); + if (recalc) fsnotify_recalc_mask(fsn_mark->connector); - mutex_unlock(&group->mark_mutex); + +out: + fsnotify_group_unlock(group); fsnotify_put_mark(fsn_mark); - return 0; -} -static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) -{ - return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); + if (!ret && (mask & FANOTIFY_PERM_EVENTS)) + fanotify_perm_watchdog_group_add(group); + + return ret; } -static int fanotify_add_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) +static struct fsnotify_event *fanotify_alloc_overflow_event(void) { - return fanotify_add_mark(group, &sb->s_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); + struct fanotify_event *oevent; + + oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT); + if (!oevent) + return NULL; + + fanotify_init_event(oevent, 0, FS_Q_OVERFLOW); + oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW; + + return &oevent->fse; } -static int fanotify_add_inode_mark(struct fsnotify_group *group, - struct inode *inode, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) +static struct hlist_head *fanotify_alloc_merge_hash(void) { - pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); + struct hlist_head *hash; - /* - * If some other task has this inode open for write we should not add - * an ignored mark, unless that ignored mark is supposed to survive - * modification changes anyway. - */ - if ((flags & FAN_MARK_IGNORED_MASK) && - !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && - inode_is_open_for_write(inode)) - return 0; + hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS, + GFP_KERNEL_ACCOUNT); + if (!hash) + return NULL; + + __hash_init(hash, FANOTIFY_HTABLE_SIZE); - return fanotify_add_mark(group, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); + return hash; } +DEFINE_CLASS(fsnotify_group, + struct fsnotify_group *, + if (!IS_ERR_OR_NULL(_T)) fsnotify_destroy_group(_T), + fsnotify_alloc_group(ops, flags), + const struct fsnotify_ops *ops, int flags) + /* fanotify syscalls */ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { - struct fsnotify_group *group; + struct user_namespace *user_ns = current_user_ns(); int f_flags, fd; - struct user_struct *user; - struct fanotify_event *oevent; + unsigned int fid_mode = flags & FANOTIFY_FID_BITS; + unsigned int class = flags & FANOTIFY_CLASS_BITS; + unsigned int internal_flags = 0; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!capable(CAP_SYS_ADMIN)) { + /* + * An unprivileged user can setup an fanotify group with + * limited functionality - an unprivileged group is limited to + * notification events with file handles or mount ids and it + * cannot use unlimited queue/marks. + */ + if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || + !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT))) + return -EPERM; + + /* + * Setting the internal flag FANOTIFY_UNPRIV on the group + * prevents setting mount/filesystem marks on this group and + * prevents reporting pid and open fd in events. + */ + internal_flags |= FANOTIFY_UNPRIV; + } #ifdef CONFIG_AUDITSYSCALL if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) @@ -852,6 +1641,22 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) #endif return -EINVAL; + /* + * A pidfd can only be returned for a thread-group leader; thus + * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually + * exclusive. + */ + if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) + return -EINVAL; + + /* Don't allow mixing mnt events with inode events for now */ + if (flags & FAN_REPORT_MNT) { + if (class != FAN_CLASS_NOTIF) + return -EINVAL; + if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR)) + return -EINVAL; + } + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -864,141 +1669,184 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) return -EINVAL; } - if ((flags & FAN_REPORT_FID) && - (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF) + if (fid_mode && class != FAN_CLASS_NOTIF) return -EINVAL; - user = get_current_user(); - if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { - free_uid(user); - return -EMFILE; - } + /* + * Child name is reported with parent fid so requires dir fid. + * We can report both child fid and dir fid with or without name. + */ + if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) + return -EINVAL; - f_flags = O_RDWR | FMODE_NONOTIFY; + /* + * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID + * and is used as an indication to report both dir and child fid on all + * dirent events. + */ + if ((fid_mode & FAN_REPORT_TARGET_FID) && + (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) + return -EINVAL; + + f_flags = O_RDWR; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; if (flags & FAN_NONBLOCK) f_flags |= O_NONBLOCK; + CLASS(fsnotify_group, group)(&fanotify_fsnotify_ops, + FSNOTIFY_GROUP_USER); /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ - group = fsnotify_alloc_group(&fanotify_fsnotify_ops); - if (IS_ERR(group)) { - free_uid(user); + if (IS_ERR(group)) return PTR_ERR(group); - } - group->fanotify_data.user = user; - group->fanotify_data.flags = flags; - atomic_inc(&user->fanotify_listeners); + /* Enforce groups limits per user in all containing user ns */ + group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(), + UCOUNT_FANOTIFY_GROUPS); + if (!group->fanotify_data.ucounts) + return -EMFILE; + + group->fanotify_data.flags = flags | internal_flags; group->memcg = get_mem_cgroup_from_mm(current->mm); + group->user_ns = get_user_ns(user_ns); - oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL, NULL); - if (unlikely(!oevent)) { - fd = -ENOMEM; - goto out_destroy_group; - } - group->overflow_event = &oevent->fse; + group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); + if (!group->fanotify_data.merge_hash) + return -ENOMEM; + + group->overflow_event = fanotify_alloc_overflow_event(); + if (unlikely(!group->overflow_event)) + return -ENOMEM; if (force_o_largefile()) event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - switch (flags & FANOTIFY_CLASS_BITS) { + INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list); + switch (class) { case FAN_CLASS_NOTIF: - group->priority = FS_PRIO_0; + group->priority = FSNOTIFY_PRIO_NORMAL; break; case FAN_CLASS_CONTENT: - group->priority = FS_PRIO_1; + group->priority = FSNOTIFY_PRIO_CONTENT; break; case FAN_CLASS_PRE_CONTENT: - group->priority = FS_PRIO_2; + group->priority = FSNOTIFY_PRIO_PRE_CONTENT; break; default: - fd = -EINVAL; - goto out_destroy_group; + return -EINVAL; } + BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE)); if (flags & FAN_UNLIMITED_QUEUE) { - fd = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out_destroy_group; group->max_events = UINT_MAX; } else { - group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; - } - - if (flags & FAN_UNLIMITED_MARKS) { - fd = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out_destroy_group; - group->fanotify_data.max_marks = UINT_MAX; - } else { - group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; + group->max_events = fanotify_max_queued_events; } if (flags & FAN_ENABLE_AUDIT) { - fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) - goto out_destroy_group; + return -EPERM; } - fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); - if (fd < 0) - goto out_destroy_group; - - return fd; - -out_destroy_group: - fsnotify_destroy_group(group); + fd = FD_ADD(f_flags, + anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, + group, f_flags, FMODE_NONOTIFY)); + if (fd >= 0) + retain_and_null_ptr(group); return fd; } -/* Check if filesystem can encode a unique fid */ -static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) +static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags, + struct fan_fsid *fsid) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; __kernel_fsid_t root_fsid; int err; /* - * Make sure path is not in filesystem with zero fsid (e.g. tmpfs). + * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ - err = vfs_get_fsid(path->dentry, fsid); + err = vfs_get_fsid(dentry, &fsid->id); if (err) return err; - if (!fsid->val[0] && !fsid->val[1]) - return -ENODEV; + fsid->sb = dentry->d_sb; + if (!fsid->id.val[0] && !fsid->id.val[1]) { + err = -ENODEV; + goto weak; + } /* - * Make sure path is not inside a filesystem subvolume (e.g. btrfs) + * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) * which uses a different fsid than sb root. */ - err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid); + err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid); if (err) return err; - if (root_fsid.val[0] != fsid->val[0] || - root_fsid.val[1] != fsid->val[1]) - return -EXDEV; + if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) { + err = -EXDEV; + goto weak; + } + + fsid->weak = false; + return 0; + +weak: + /* Allow weak fsid when marking inodes */ + fsid->weak = true; + return (mark_type == FAN_MARK_INODE) ? 0 : err; +} + +/* Check if filesystem can encode a unique fid */ +static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) +{ + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + const struct export_operations *nop = dentry->d_sb->s_export_op; /* - * We need to make sure that the file system supports at least - * encoding a file handle so user can use name_to_handle_at() to - * compare fid returned with event to the file handle of watched - * objects. However, name_to_handle_at() requires that the - * filesystem also supports decoding file handles. + * We need to make sure that the filesystem supports encoding of + * file handles so user can use name_to_handle_at() to compare fids + * reported with events to the file handle of watched objects. */ - if (!path->dentry->d_sb->s_export_op || - !path->dentry->d_sb->s_export_op->fh_to_dentry) + if (!exportfs_can_encode_fid(nop)) + return -EOPNOTSUPP; + + /* + * For sb/mount mark, we also need to make sure that the filesystem + * supports decoding file handles, so user has a way to map back the + * reported fids to filesystem objects. + */ + if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop)) return -EOPNOTSUPP; return 0; } -static int fanotify_events_supported(struct path *path, __u64 mask) +static int fanotify_events_supported(struct fsnotify_group *group, + const struct path *path, __u64 mask, + unsigned int flags) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + bool is_dir = d_is_dir(path->dentry); + /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ + bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || + (mask & FAN_RENAME) || + (flags & FAN_MARK_IGNORE); + + /* + * Filesystems need to opt-into pre-content evnets (a.k.a HSM) + * and they are only supported on regular files and directories. + */ + if (mask & FANOTIFY_PRE_CONTENT_EVENTS) { + if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM)) + return -EOPNOTSUPP; + if (!is_dir && !d_is_reg(path->dentry)) + return -EINVAL; + } + /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of @@ -1010,6 +1858,30 @@ static int fanotify_events_supported(struct path *path, __u64 mask) if (mask & FANOTIFY_PERM_EVENTS && path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) return -EINVAL; + + /* + * mount and sb marks are not allowed on kernel internal pseudo fs, + * like pipe_mnt, because that would subscribe to events on all the + * anonynous pipes in the system. + * + * SB_NOUSER covers all of the internal pseudo fs whose objects are not + * exposed to user's mount namespace, but there are other SB_KERNMOUNT + * fs, like nsfs, debugfs, for which the value of allowing sb and mount + * mark is questionable. For now we leave them alone. + */ + if (mark_type != FAN_MARK_INODE && + path->mnt->mnt_sb->s_flags & SB_NOUSER) + return -EINVAL; + + /* + * We shouldn't have allowed setting dirent events and the directory + * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, + * but because we always allowed it, error only when using new APIs. + */ + if (strict_dir_events && mark_type == FAN_MARK_INODE && + !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) + return -ENOTDIR; + return 0; } @@ -1017,21 +1889,25 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { struct inode *inode = NULL; - struct vfsmount *mnt = NULL; struct fsnotify_group *group; - struct fd f; struct path path; - __kernel_fsid_t __fsid, *fsid = NULL; + struct fan_fsid __fsid, *fsid = NULL; + struct user_namespace *user_ns = NULL; + struct mnt_namespace *mntns; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; - unsigned int obj_type; + unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; + unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; + unsigned int obj_type, fid_mode; + void *obj = NULL; + u32 umask = 0; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", __func__, fanotify_fd, flags, dfd, pathname, mask); /* we only use the lower 32 bits as of right now. */ - if (mask & ((__u64)0xffffffff << 32)) + if (upper_32_bits(mask)) return -EINVAL; if (flags & ~FANOTIFY_MARK_FLAGS) @@ -1047,12 +1923,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, case FAN_MARK_FILESYSTEM: obj_type = FSNOTIFY_OBJ_TYPE_SB; break; + case FAN_MARK_MNTNS: + obj_type = FSNOTIFY_OBJ_TYPE_MNTNS; + break; default: return -EINVAL; } - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { - case FAN_MARK_ADD: /* fallthrough */ + switch (mark_cmd) { + case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) return -EINVAL; @@ -1071,96 +1950,197 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ~valid_mask) return -EINVAL; - f = fdget(fanotify_fd); - if (unlikely(!f.file)) + + /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ + if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) + return -EINVAL; + + /* + * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with + * FAN_MARK_IGNORED_MASK. + */ + if (ignore == FAN_MARK_IGNORED_MASK) { + mask &= ~FANOTIFY_EVENT_FLAGS; + umask = FANOTIFY_EVENT_FLAGS; + } + + CLASS(fd, f)(fanotify_fd); + if (fd_empty(f)) return -EBADF; /* verify that this is indeed an fanotify instance */ - ret = -EINVAL; - if (unlikely(f.file->f_op != &fanotify_fops)) - goto fput_and_out; - group = f.file->private_data; + if (unlikely(fd_file(f)->f_op != &fanotify_fops)) + return -EINVAL; + group = fd_file(f)->private_data; + + /* Only report mount events on mnt namespace */ + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (mask & ~FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type != FAN_MARK_MNTNS) + return -EINVAL; + } else { + if (mask & FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type == FAN_MARK_MNTNS) + return -EINVAL; + } /* - * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not - * allowed to set permissions events. + * A user is allowed to setup sb/mount/mntns marks only if it is + * capable in the user ns where the group was created. + */ + if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) && + mark_type != FAN_MARK_INODE) + return -EPERM; + + /* + * Permission events are not allowed for FAN_CLASS_NOTIF. + * Pre-content permission events are not allowed for FAN_CLASS_CONTENT. */ - ret = -EINVAL; if (mask & FANOTIFY_PERM_EVENTS && - group->priority == FS_PRIO_0) - goto fput_and_out; + group->priority == FSNOTIFY_PRIO_NORMAL) + return -EINVAL; + else if (mask & FANOTIFY_PRE_CONTENT_EVENTS && + group->priority == FSNOTIFY_PRIO_CONTENT) + return -EINVAL; + + if (mask & FAN_FS_ERROR && + mark_type != FAN_MARK_FILESYSTEM) + return -EINVAL; /* - * Events with data type inode do not carry enough information to report - * event->fd, so we do not allow setting a mask for inode events unless - * group supports reporting fid. - * inode events are not supported on a mount mark, because they do not - * carry enough information (i.e. path) to be filtered by mount point. + * Evictable is only relevant for inode marks, because only inode object + * can be evicted on memory pressure. */ - if (mask & FANOTIFY_INODE_EVENTS && - (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) || - mark_type == FAN_MARK_MOUNT)) - goto fput_and_out; + if (flags & FAN_MARK_EVICTABLE && + mark_type != FAN_MARK_INODE) + return -EINVAL; - if (flags & FAN_MARK_FLUSH) { - ret = 0; - if (mark_type == FAN_MARK_MOUNT) - fsnotify_clear_vfsmount_marks_by_group(group); - else if (mark_type == FAN_MARK_FILESYSTEM) - fsnotify_clear_sb_marks_by_group(group); - else - fsnotify_clear_inode_marks_by_group(group); - goto fput_and_out; + /* + * Events that do not carry enough information to report + * event->fd require a group that supports reporting fid. Those + * events are not supported on a mount mark, because they do not + * carry enough information (i.e. path) to be filtered by mount + * point. + */ + fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) && + (!fid_mode || mark_type == FAN_MARK_MOUNT)) + return -EINVAL; + + /* + * FAN_RENAME uses special info type records to report the old and + * new parent+name. Reporting only old and new parent id is less + * useful and was not implemented. + */ + if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) + return -EINVAL; + + /* Pre-content events are not currently generated for directories. */ + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) + return -EINVAL; + + if (mark_cmd == FAN_MARK_FLUSH) { + fsnotify_clear_marks_by_group(group, obj_type); + return 0; } ret = fanotify_find_path(dfd, pathname, &path, flags, (mask & ALL_FSNOTIFY_EVENTS), obj_type); if (ret) - goto fput_and_out; + return ret; - if (flags & FAN_MARK_ADD) { - ret = fanotify_events_supported(&path, mask); + if (mark_cmd == FAN_MARK_ADD) { + ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; } - if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { - ret = fanotify_test_fid(&path, &__fsid); + if (fid_mode) { + ret = fanotify_test_fsid(path.dentry, flags, &__fsid); + if (ret) + goto path_put_and_out; + + ret = fanotify_test_fid(path.dentry, flags); if (ret) goto path_put_and_out; fsid = &__fsid; } - /* inode held in place by reference to path; group by fget on fd */ - if (mark_type == FAN_MARK_INODE) + /* + * In addition to being capable in the user ns where group was created, + * the user also needs to be capable in the user ns associated with + * the filesystem or in the user ns associated with the mntns + * (when marking mntns). + */ + if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { inode = path.dentry->d_inode; - else - mnt = path.mnt; + obj = inode; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + user_ns = path.mnt->mnt_sb->s_user_ns; + obj = path.mnt; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) { + user_ns = path.mnt->mnt_sb->s_user_ns; + obj = path.mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { + ret = -EINVAL; + mntns = mnt_ns_from_dentry(path.dentry); + if (!mntns) + goto path_put_and_out; + user_ns = mntns->user_ns; + obj = mntns; + } + + ret = -EPERM; + if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN)) + goto path_put_and_out; + + ret = -EINVAL; + if (!obj) + goto path_put_and_out; + + /* + * If some other task has this inode open for write we should not add + * an ignore mask, unless that ignore mask is supposed to survive + * modification changes anyway. + */ + if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { + ret = !inode ? -EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (ignore == FAN_MARK_IGNORE && + (!inode || S_ISDIR(inode->i_mode))) + goto path_put_and_out; + + ret = 0; + if (inode && inode_is_open_for_write(inode)) + goto path_put_and_out; + } + + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ + if (!inode || !S_ISDIR(inode->i_mode)) { + mask &= ~FAN_EVENT_ON_CHILD; + umask = FAN_EVENT_ON_CHILD; + /* + * If group needs to report parent fid, register for getting + * events with parent/name info for non-directory. + */ + if ((fid_mode & FAN_REPORT_DIR_FID) && + (flags & FAN_MARK_ADD) && !ignore) + mask |= FAN_EVENT_ON_CHILD; + } /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { + switch (mark_cmd) { case FAN_MARK_ADD: - if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_add_vfsmount_mark(group, mnt, mask, - flags, fsid); - else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, - flags, fsid); - else - ret = fanotify_add_inode_mark(group, inode, mask, - flags, fsid); + ret = fanotify_add_mark(group, obj, obj_type, mask, flags, + fsid); break; case FAN_MARK_REMOVE: - if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_remove_vfsmount_mark(group, mnt, mask, - flags); - else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, - flags); - else - ret = fanotify_remove_inode_mark(group, inode, mask, - flags); + ret = fanotify_remove_mark(group, obj, obj_type, mask, flags, + umask); break; default: ret = -EINVAL; @@ -1168,31 +2148,26 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, path_put_and_out: path_put(&path); -fput_and_out: - fdput(f); return ret; } +#ifndef CONFIG_ARCH_SPLIT_ARG64 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, __u64, mask, int, dfd, const char __user *, pathname) { return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); } +#endif -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(fanotify_mark, +#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT) +SYSCALL32_DEFINE6(fanotify_mark, int, fanotify_fd, unsigned int, flags, - __u32, mask0, __u32, mask1, int, dfd, + SC_ARG64(mask), int, dfd, const char __user *, pathname) { - return do_fanotify_mark(fanotify_fd, flags, -#ifdef __BIG_ENDIAN - ((__u64)mask0 << 32) | mask1, -#else - ((__u64)mask1 << 32) | mask0, -#endif - dfd, pathname); + return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask), + dfd, pathname); } #endif @@ -1203,10 +2178,26 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); + struct sysinfo si; + int max_marks; + + si_meminfo(&si); + /* + * Allow up to 1% of addressable memory to be accounted for per user + * marks limited to the range [8192, 1048576]. mount and sb marks are + * a lot cheaper than inode marks, but there is no reason for a user + * to have many of those, so calculate by the cost of inode marks. + */ + max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / + INODE_MARK_COST; + max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, + FANOTIFY_DEFAULT_MAX_USER_MARKS); - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); + + fanotify_mark_cache = KMEM_CACHE(fanotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, SLAB_PANIC); @@ -1216,6 +2207,13 @@ static int __init fanotify_user_setup(void) fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } + fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC); + + fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; + init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = + FANOTIFY_DEFAULT_MAX_GROUPS; + init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; + fanotify_sysctls_init(); return 0; } |
