diff options
Diffstat (limited to 'fs')
47 files changed, 645 insertions, 330 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 080e2ebb8aa0..f45b61fe9a9a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3516,7 +3516,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) struct bio *bio = device->flush_bio; if (!device->flush_bio_sent) - return 0; + return BLK_STS_OK; device->flush_bio_sent = 0; wait_for_completion_io(&device->flush_wait); @@ -3563,7 +3563,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) continue; write_dev_flush(dev); - dev->last_flush_error = 0; + dev->last_flush_error = BLK_STS_OK; } /* wait for all the barriers */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95c212037095..24bcd5cd9cf2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7924,11 +7924,12 @@ err: return ret; } -static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num) +static inline blk_status_t submit_dio_repair_bio(struct inode *inode, + struct bio *bio, + int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); @@ -7980,10 +7981,10 @@ static int btrfs_check_dio_repairable(struct inode *inode, return 1; } -static int dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - bio_end_io_t *repair_endio, void *repair_arg) +static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -7993,18 +7994,19 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, int read_mode = 0; int segs; int ret; + blk_status_t status; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) - return ret; + return errno_to_blk_status(ret); ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { free_io_failure(failure_tree, io_tree, failrec); - return -EIO; + return BLK_STS_IOERR; } segs = bio_segments(failed_bio); @@ -8022,13 +8024,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); - if (ret) { + status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); + if (status) { free_io_failure(failure_tree, io_tree, failrec); bio_put(bio); } - return ret; + return status; } struct btrfs_retry_complete { @@ -8065,8 +8067,8 @@ end: bio_put(bio); } -static int __btrfs_correct_data_nocsum(struct inode *inode, - struct btrfs_io_bio *io_bio) +static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode, + struct btrfs_io_bio *io_bio) { struct btrfs_fs_info *fs_info; struct bio_vec bvec; @@ -8076,8 +8078,8 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, unsigned int pgoff; u32 sectorsize; int nr_sectors; - int ret; - int err = 0; + blk_status_t ret; + blk_status_t err = BLK_STS_OK; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; @@ -8183,11 +8185,12 @@ static blk_status_t __btrfs_subio_endio_read(struct inode *inode, int csum_pos; bool uptodate = (err == 0); int ret; + blk_status_t status; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; - err = 0; + err = BLK_STS_OK; start = io_bio->logical; done.inode = inode; io_bio->bio.bi_iter = io_bio->iter; @@ -8209,12 +8212,12 @@ try_again: done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, - btrfs_retry_endio, &done); - if (ret) { - err = errno_to_blk_status(ret); + status = dio_read_error(inode, &io_bio->bio, bvec.bv_page, + pgoff, start, start + sectorsize - 1, + io_bio->mirror_num, btrfs_retry_endio, + &done); + if (status) { + err = status; goto next; } @@ -8250,7 +8253,7 @@ static blk_status_t btrfs_subio_endio_read(struct inode *inode, if (unlikely(err)) return __btrfs_correct_data_nocsum(inode, io_bio); else - return 0; + return BLK_STS_OK; } else { return __btrfs_subio_endio_read(inode, io_bio, err); } @@ -8423,9 +8426,9 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, return 0; } -static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, - u64 file_offset, int skip_sum, - int async_submit) +static inline blk_status_t +__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, + int skip_sum, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; @@ -8488,6 +8491,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int clone_offset = 0; int clone_len; int ret; + blk_status_t status; map_length = orig_bio->bi_iter.bi_size; submit_len = map_length; @@ -8537,9 +8541,9 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, */ atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, - async_submit); - if (ret) { + status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (status) { bio_put(bio); atomic_dec(&dip->pending_bios); goto out_err; @@ -8557,9 +8561,9 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, } while (submit_len > 0); submit: - ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, - async_submit); - if (!ret) + status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (!status) return 0; bio_put(bio); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 208638384cd2..2cf6ba40f7c4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -905,7 +905,7 @@ static void raid_write_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - err = 0; + err = BLK_STS_OK; /* OK, we have read all the stripes we need to. */ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? @@ -1324,7 +1324,7 @@ write_data: return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1475,7 +1475,7 @@ static void raid_rmw_end_io(struct bio *bio) cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } static void async_rmw_stripe(struct btrfs_raid_bio *rbio) @@ -1579,7 +1579,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) return 0; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return -EIO; finish: @@ -1795,12 +1795,12 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) void **pointers; int faila = -1, failb = -1; struct page *page; - int err; + blk_status_t err; int i; pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { - err = -ENOMEM; + err = BLK_STS_RESOURCE; goto cleanup_io; } @@ -1856,7 +1856,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * a bad data or Q stripe. * TODO, we should redo the xor here. */ - err = -EIO; + err = BLK_STS_IOERR; goto cleanup; } /* @@ -1882,7 +1882,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { if (rbio->bbio->raid_map[faila] == RAID5_P_STRIPE) { - err = -EIO; + err = BLK_STS_IOERR; goto cleanup; } /* @@ -1954,13 +1954,13 @@ pstripe: } } - err = 0; + err = BLK_STS_OK; cleanup: kfree(pointers); cleanup_io: if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - if (err == 0) + if (err == BLK_STS_OK) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -1968,7 +1968,7 @@ cleanup_io: rbio_orig_end_io(rbio, err); } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { rbio_orig_end_io(rbio, err); - } else if (err == 0) { + } else if (err == BLK_STS_OK) { rbio->faila = -1; rbio->failb = -1; @@ -2005,7 +2005,7 @@ static void raid_recover_end_io(struct bio *bio) return; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); else __raid_recover_end_io(rbio); } @@ -2104,7 +2104,7 @@ out: cleanup: if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return -EIO; } @@ -2431,7 +2431,7 @@ submit_write: nr_data = bio_list_size(&bio_list); if (!nr_data) { /* Every parity is right */ - rbio_orig_end_io(rbio, 0); + rbio_orig_end_io(rbio, BLK_STS_OK); return; } @@ -2451,7 +2451,7 @@ submit_write: return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2519,7 +2519,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -2633,7 +2633,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return; finish: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e8b9a269fdde..bd679bc7a1a9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6212,8 +6212,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } } -int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit) +blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, int async_submit) { struct btrfs_device *dev; struct bio *first_bio = bio; @@ -6233,7 +6233,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); - return ret; + return errno_to_blk_status(ret); } total_devs = bbio->num_stripes; @@ -6256,7 +6256,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } btrfs_bio_counter_dec(fs_info); - return ret; + return errno_to_blk_status(ret); } if (map_length < length) { @@ -6283,7 +6283,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, dev_nr, async_submit); } btrfs_bio_counter_dec(fs_info); - return 0; + return BLK_STS_OK; } struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6f45fd60d15a..93277fc60930 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -74,7 +74,7 @@ struct btrfs_device { int missing; int can_discard; int is_tgtdev_for_dev_replace; - int last_flush_error; + blk_status_t last_flush_error; int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED @@ -416,8 +416,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit); +blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 50836280a6f8..1bc709fe330a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,7 +189,7 @@ static int ceph_releasepage(struct page *page, gfp_t g) /* * read a single page, without unlocking it. */ -static int readpage_nounlock(struct file *filp, struct page *page) +static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); @@ -219,7 +219,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_readpage_from_fscache(inode, page); if (err == 0) - goto out; + return -EINPROGRESS; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); @@ -249,8 +249,11 @@ out: static int ceph_readpage(struct file *filp, struct page *page) { - int r = readpage_nounlock(filp, page); - unlock_page(page); + int r = ceph_do_readpage(filp, page); + if (r != -EINPROGRESS) + unlock_page(page); + else + r = 0; return r; } @@ -1237,7 +1240,7 @@ retry_locked: goto retry_locked; r = writepage_nounlock(page, NULL); if (r < 0) - goto fail_nosnap; + goto fail_unlock; goto retry_locked; } @@ -1265,11 +1268,14 @@ retry_locked: } /* we need to read it. */ - r = readpage_nounlock(file, page); - if (r < 0) - goto fail_nosnap; + r = ceph_do_readpage(file, page); + if (r < 0) { + if (r == -EINPROGRESS) + return -EAGAIN; + goto fail_unlock; + } goto retry_locked; -fail_nosnap: +fail_unlock: unlock_page(page); return r; } diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index fd1172823f86..337f88673ed9 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -297,13 +297,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) } } -static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); -} - -static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) { if (!error) SetPageUptodate(page); @@ -331,7 +325,7 @@ int ceph_readpage_from_fscache(struct inode *inode, struct page *page) return -ENOBUFS; ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_vfs_readpage_complete, NULL, + ceph_readpage_from_fscache_complete, NULL, GFP_KERNEL); switch (ret) { @@ -360,7 +354,7 @@ int ceph_readpages_from_fscache(struct inode *inode, return -ENOBUFS; ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_vfs_readpage_complete_unlock, + ceph_readpage_from_fscache_complete, NULL, mapping_gfp_mask(mapping)); switch (ret) { diff --git a/fs/char_dev.c b/fs/char_dev.c index fb8507f521b2..ebcc8fb3fa66 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -28,6 +28,8 @@ static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); +#define CHRDEV_MAJOR_HASH_SIZE 255 + static struct char_device_struct { struct char_device_struct *next; unsigned int major; @@ -49,16 +51,39 @@ void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; - if (offset < CHRDEV_MAJOR_HASH_SIZE) { - mutex_lock(&chrdevs_lock); - for (cd = chrdevs[offset]; cd; cd = cd->next) + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { + if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); - mutex_unlock(&chrdevs_lock); } + mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ +static int find_dynamic_major(void) +{ + int i; + struct char_device_struct *cd; + + for (i = ARRAY_SIZE(chrdevs)-1; i > CHRDEV_MAJOR_DYN_END; i--) { + if (chrdevs[i] == NULL) + return i; + } + + for (i = CHRDEV_MAJOR_DYN_EXT_START; + i > CHRDEV_MAJOR_DYN_EXT_END; i--) { + for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) + if (cd->major == i) + break; + + if (cd == NULL || cd->major != i) + return i; + } + + return -EBUSY; +} + /* * Register a single major with a specified minor range. * @@ -84,22 +109,21 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, mutex_lock(&chrdevs_lock); - /* temporary */ if (major == 0) { - for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { - if (chrdevs[i] == NULL) - break; - } - - if (i < CHRDEV_MAJOR_DYN_END) - pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range\n", - name, i); - - if (i == 0) { - ret = -EBUSY; + ret = find_dynamic_major(); + if (ret < 0) { + pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", + name); goto out; } - major = i; + major = ret; + } + + if (major >= CHRDEV_MAJOR_MAX) { + pr_err("CHRDEV \"%s\" major requested (%d) is greater than the maximum (%d)\n", + name, major, CHRDEV_MAJOR_MAX); + ret = -EINVAL; + goto out; } cd->major = major; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 59647eb72c5f..83a8f52cd879 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1223,6 +1223,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, char *tmp_end, *value; char delim; bool got_ip = false; + bool got_version = false; unsigned short port = 0; struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; @@ -1874,24 +1875,35 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, pr_warn("CIFS: server netbiosname longer than 15 truncated.\n"); break; case Opt_ver: + /* version of mount userspace tools, not dialect */ string = match_strdup(args); if (string == NULL) goto out_nomem; + /* If interface changes in mount.cifs bump to new ver */ if (strncasecmp(string, "1", 1) == 0) { + if (strlen(string) > 1) { + pr_warn("Bad mount helper ver=%s. Did " + "you want SMB1 (CIFS) dialect " + "and mean to type vers=1.0 " + "instead?\n", string); + goto cifs_parse_mount_err; + } /* This is the default */ break; } /* For all other value, error */ - pr_warn("CIFS: Invalid version specified\n"); + pr_warn("CIFS: Invalid mount helper version specified\n"); goto cifs_parse_mount_err; case Opt_vers: + /* protocol version (dialect) */ string = match_strdup(args); if (string == NULL) goto out_nomem; if (cifs_parse_smb_version(string, vol) != 0) goto cifs_parse_mount_err; + got_version = true; break; case Opt_sec: string = match_strdup(args); @@ -1973,6 +1985,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, else if (override_gid == 1) pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n"); + if (got_version == false) + pr_warn("No dialect specified on mount. Default has changed to " + "a more secure dialect, SMB3 (vers=3.0), from CIFS " + "(SMB1). To use the less secure SMB1 dialect to access " + "old servers which do not support SMB3 specify vers=1.0" + " on mount. For somewhat newer servers such as Windows " + "7 try vers=2.1.\n"); + kfree(mountdata_copy); return 0; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 56366e984076..e702d48bd023 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -194,15 +194,20 @@ cifs_bp_rename_retry: } /* + * Don't allow path components longer than the server max. * Don't allow the separator character in a path component. * The VFS will not allow "/", but "\" is allowed by posix. */ static int -check_name(struct dentry *direntry) +check_name(struct dentry *direntry, struct cifs_tcon *tcon) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); int i; + if (unlikely(direntry->d_name.len > + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength))) + return -ENAMETOOLONG; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { for (i = 0; i < direntry->d_name.len; i++) { if (direntry->d_name.name[i] == '\\') { @@ -500,10 +505,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, return finish_no_open(file, res); } - rc = check_name(direntry); - if (rc) - return rc; - xid = get_xid(); cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", @@ -516,6 +517,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, } tcon = tlink_tcon(tlink); + + rc = check_name(direntry, tcon); + if (rc) + goto out_free_xid; + server = tcon->ses->server; if (server->ops->new_lease_key) @@ -776,7 +782,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } pTcon = tlink_tcon(tlink); - rc = check_name(direntry); + rc = check_name(direntry, pTcon); if (rc) goto lookup_out; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5fb2fc2d0080..7aa67206f6da 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -514,7 +514,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) * No tcon so can't do * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); */ - if (rc != 0) + if (rc == -EOPNOTSUPP) { + cifs_dbg(VFS, "Dialect not supported by server. Consider " + "specifying vers=1.0 or vers=2.1 on mount for accessing" + " older servers\n"); + goto neg_exit; + } else if (rc != 0) goto neg_exit; cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); @@ -3219,8 +3224,8 @@ copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf, kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) * le32_to_cpu(pfs_inf->SectorsPerAllocationUnit); kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits); - kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits); - kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); + kst->f_bfree = kst->f_bavail = + le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); return; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 18700fd25a0b..2826882c81d1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* BB FIXME - analyze following length BB */ -#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ +/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ +#define MAX_SMB2_HDR_SIZE 0x00b0 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) @@ -646,11 +646,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pte_t pte, *ptep = NULL; pmd_t *pmdp = NULL; spinlock_t *ptl; - bool changed; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - unsigned long address; + unsigned long address, start, end; cond_resched(); @@ -658,8 +657,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, continue; address = pgoff_address(index, vma); - changed = false; - if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) + + /* + * Note because we provide start/end to follow_pte_pmd it will + * call mmu_notifier_invalidate_range_start() on our behalf + * before taking any lock. + */ + if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; if (pmdp) { @@ -676,7 +680,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -691,13 +695,12 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } - if (changed) - mmu_notifier_invalidate_page(vma->vm_mm, address); + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); } i_mmap_unlock_read(mapping); } @@ -1383,6 +1386,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); + /* + * Make sure that the faulting address's PMD offset (color) matches + * the PMD offset from the start of the file. This is necessary so + * that a PMD range in the page table overlaps exactly with a PMD + * range in the radix tree. + */ + if ((vmf->pgoff & PG_PMD_COLOUR) != + ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) + goto fallback; + /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) goto fallback; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 44dfbca9306f..7eae33ffa3fc 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,7 +133,51 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) +static int devpts_ptmx_path(struct path *path) +{ + struct super_block *sb; + int err; + + /* Has the devpts filesystem already been found? */ + if (path->mnt->mnt_sb->s_magic == DEVPTS_SUPER_MAGIC) + return 0; + + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(path); + if (err) + return err; + + /* Is the path the root of a devpts filesystem? */ + sb = path->mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path->mnt->mnt_root != sb->s_root)) + return -ENODEV; + + return 0; +} + +struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) +{ + struct path path; + int err; + + path = filp->f_path; + path_get(&path); + + err = devpts_ptmx_path(&path); + dput(path.dentry); + if (err) { + mntput(path.mnt); + path.mnt = ERR_PTR(err); + } + if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) { + mntput(path.mnt); + path.mnt = ERR_PTR(-ENODEV); + } + return path.mnt; +} + +struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; struct path path; @@ -142,31 +186,18 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) path = filp->f_path; path_get(&path); - *ptsmnt = NULL; - /* Has the devpts filesystem already been found? */ - sb = path.mnt->mnt_sb; - if (sb->s_magic != DEVPTS_SUPER_MAGIC) { - /* Is a devpts filesystem at "pts" in the same directory? */ - err = path_pts(&path); - if (err) { - result = ERR_PTR(err); - goto out; - } - - /* Is the path the root of a devpts filesystem? */ - result = ERR_PTR(-ENODEV); - sb = path.mnt->mnt_sb; - if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || - (path.mnt->mnt_root != sb->s_root)) - goto out; + err = devpts_ptmx_path(&path); + if (err) { + result = ERR_PTR(err); + goto out; } /* * pty code needs to hold extra references in case of last /dev/tty close */ + sb = path.mnt->mnt_sb; atomic_inc(&sb->s_active); - *ptsmnt = mntget(path.mnt); result = DEVPTS_SB(sb); out: diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e767e4389cb1..adbe328b957c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -600,8 +600,13 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) wait_queue_head_t *whead; rcu_read_lock(); - /* If it is cleared by POLLFREE, it should be rcu-safe */ - whead = rcu_dereference(pwq->whead); + /* + * If it is cleared by POLLFREE, it should be rcu-safe. + * If we read NULL we need a barrier paired with + * smp_store_release() in ep_poll_callback(), otherwise + * we rely on whead->lock. + */ + whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); @@ -1134,17 +1139,6 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v struct eventpoll *ep = epi->ep; int ewake = 0; - if ((unsigned long)key & POLLFREE) { - ep_pwq_from_wait(wait)->whead = NULL; - /* - * whead = NULL above can race with ep_remove_wait_queue() - * which can do another remove_wait_queue() after us, so we - * can't use __remove_wait_queue(). whead->lock is held by - * the caller. - */ - list_del_init(&wait->entry); - } - spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1228,10 +1222,26 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); - if (epi->event.events & EPOLLEXCLUSIVE) - return ewake; + if (!(epi->event.events & EPOLLEXCLUSIVE)) + ewake = 1; + + if ((unsigned long)key & POLLFREE) { + /* + * If we race with ep_remove_wait_queue() it can miss + * ->whead = NULL and do another remove_wait_queue() after + * us, so we can't use __remove_wait_queue(). + */ + list_del_init(&wait->entry); + /* + * ->whead != NULL protects us from the race with ep_free() + * or ep_remove(), ep_remove_wait_queue() takes whead->lock + * held by the caller. Once we nullify it, nothing protects + * ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } - return 1; + return ewake; } /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5a1052627a81..701085620cd8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2300,7 +2300,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) EXT4_MAX_BLOCK_LOG_SIZE); struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[blocksize_bits + 2]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; @@ -2309,6 +2309,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); + i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext4_group_info); + grinfo = ext4_get_group_info(sb, group); /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { @@ -2320,7 +2323,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) buddy_loaded = 1; } - memcpy(&sg, ext4_get_group_info(sb, group), sizeof(sg)); + memcpy(&sg, ext4_get_group_info(sb, group), i); if (buddy_loaded) ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 82a5af9f6668..3dd970168448 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1543,7 +1543,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } - return 0; + goto update_hash; } /* Compute min_offs and last. */ @@ -1707,6 +1707,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, here->e_value_size = cpu_to_le32(i->value_len); } +update_hash: if (i->value) { __le32 hash = 0; @@ -1725,7 +1726,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { - __le32 *value = s->base + min_offs - new_size; + __le32 *value = s->base + le16_to_cpu( + here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 2524807ee070..9d5eecb123de 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -86,19 +86,6 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) char *data; const char *name = gfs2_acl_name(type); - if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) - return -E2BIG; - - if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - if (mode != inode->i_mode) - mark_inode_dirty(inode); - } - if (acl) { len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); if (len == 0) @@ -129,6 +116,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct gfs2_holder gh; bool need_unlock = false; int ret; + umode_t mode; + + if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; ret = gfs2_rsqa_alloc(ip); if (ret) @@ -140,7 +131,20 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; need_unlock = true; } + + mode = inode->i_mode; + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + goto unlock; + } + ret = __gfs2_set_acl(inode, acl, type); + if (!ret && mode != inode->i_mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } +unlock: if (need_unlock) gfs2_glock_dq_uninit(&gh); return ret; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ed7a2e252ad8..68ed06962537 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -234,7 +234,19 @@ out: static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + + /* + * Even if we didn't write any pages here, we might still be holding + * dirty pages in the ail. We forcibly flush the ail because we don't + * want balance_dirty_pages() to loop indefinitely trying to write out + * pages held in the ail that it can't find. + */ + if (ret == 0) + set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); + + return ret; } /** diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9fa3aef9a5b3..3dd0cceefa43 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -291,8 +291,9 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, - rabh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + rabh); continue; } unlock_buffer(rabh); @@ -1103,8 +1104,15 @@ static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, while (true) { ptr = metapointer(h, mp); - if (*ptr) /* if we have a non-null pointer */ + if (*ptr) { /* if we have a non-null pointer */ + /* Now zero the metapath after the current height. */ + h++; + if (h < GFS2_MAX_META_HEIGHT) + memset(&mp->mp_list[h], 0, + (GFS2_MAX_META_HEIGHT - h) * + sizeof(mp->mp_list[0])); return true; + } if (mp->mp_list[h] < ptrs) mp->mp_list[h]++; @@ -1120,6 +1128,13 @@ enum dealloc_states { DEALLOC_DONE = 3, /* process complete */ }; +static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h) +{ + if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0]))) + return false; + return true; +} + /** * trunc_dealloc - truncate a file down to a desired size * @ip: inode to truncate @@ -1197,8 +1212,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) /* If we're truncating to a non-zero size and the mp is at the beginning of file for the strip height, we need to preserve the first metadata pointer. */ - preserve1 = (newsize && - (mp.mp_list[mp_h] == nbof[mp_h])); + preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h)); bh = mp.mp_bh[mp_h]; gfs2_assert_withdraw(sdp, bh); if (gfs2_assert_withdraw(sdp, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 5ee2e2f8576c..06a0d1947c77 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1513,7 +1513,9 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + bh); continue; } brelse(bh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c2062a108d19..bb48074be019 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1030,8 +1030,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); - gl = fl_gh->gh_gl; - if (gl) { + if (gfs2_holder_initialized(fl_gh)) { if (fl_gh->gh_state == state) goto out; locks_lock_file_wait(file, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c38ab6c81898..98e845b7841b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/delay.h> #include <linux/sort.h> +#include <linux/hash.h> #include <linux/jhash.h> #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> @@ -71,7 +72,7 @@ static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) -static struct rhashtable_params ht_parms = { +static const struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), @@ -80,6 +81,49 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; +#define GLOCK_WAIT_TABLE_BITS 12 +#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) +static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; + +struct wait_glock_queue { + struct lm_lockname *name; + wait_queue_entry_t wait; +}; + +static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct wait_glock_queue *wait_glock = + container_of(wait, struct wait_glock_queue, wait); + struct lm_lockname *wait_name = wait_glock->name; + struct lm_lockname *wake_name = key; + + if (wake_name->ln_sbd != wait_name->ln_sbd || + wake_name->ln_number != wait_name->ln_number || + wake_name->ln_type != wait_name->ln_type) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) +{ + u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + + return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); +} + +/** + * wake_up_glock - Wake up waiters on a glock + * @gl: the glock + */ +static void wake_up_glock(struct gfs2_glock *gl) +{ + wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); + + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); +} + static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); @@ -96,6 +140,9 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); + smp_mb(); + wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); @@ -107,7 +154,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -static void gfs2_glock_hold(struct gfs2_glock *gl) +void gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); @@ -150,6 +197,9 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); @@ -191,13 +241,20 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); - rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } +/* + * Cause the glock to be put in work queue context. + */ +void gfs2_glock_queue_put(struct gfs2_glock *gl) +{ + gfs2_glock_queue_work(gl, 0); +} + /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put @@ -676,6 +733,40 @@ static void glock_work_func(struct work_struct *work) spin_unlock(&gl->gl_lockref.lock); } +static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, + struct gfs2_glock *new) +{ + struct wait_glock_queue wait; + wait_queue_head_t *wq = glock_waitqueue(name); + struct gfs2_glock *gl; + + wait.name = name; + init_wait(&wait.wait); + wait.wait.func = glock_wake_function; + +again: + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + rcu_read_lock(); + if (new) { + gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, + &new->gl_node, ht_parms); + if (IS_ERR(gl)) + goto out; + } else { + gl = rhashtable_lookup_fast(&gl_hash_table, + name, ht_parms); + } + if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { + rcu_read_unlock(); + schedule(); + goto again; + } +out: + rcu_read_unlock(); + finish_wait(wq, &wait.wait); + return gl; +} + /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -702,15 +793,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret = 0; - rcu_read_lock(); - gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); - if (gl && !lockref_get_not_dead(&gl->gl_lockref)) - gl = NULL; - rcu_read_unlock(); - - *glp = gl; - if (gl) + gl = find_insert_glock(&name, NULL); + if (gl) { + *glp = gl; return 0; + } if (!create) return -ENOENT; @@ -764,10 +851,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->writeback_index = 0; } -again: - rcu_read_lock(); - tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node, - ht_parms); + tmp = find_insert_glock(&name, gl); if (!tmp) { *glp = gl; goto out; @@ -776,13 +860,7 @@ again: ret = PTR_ERR(tmp); goto out_free; } - if (lockref_get_not_dead(&tmp->gl_lockref)) { - *glp = tmp; - goto out_free; - } - rcu_read_unlock(); - cond_resched(); - goto again; + *glp = tmp; out_free: kfree(gl->gl_lksb.sb_lvbptr); @@ -790,7 +868,6 @@ out_free: atomic_dec(&sdp->sd_glock_disposal); out: - rcu_read_unlock(); return ret; } @@ -1473,14 +1550,15 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) do { gl = ERR_PTR(rhashtable_walk_start(&iter)); - if (gl) - continue; + if (IS_ERR(gl)) + goto walk_stop; while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) - if ((gl->gl_name.ln_sbd == sdp) && + if (gl->gl_name.ln_sbd == sdp && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); +walk_stop: rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); @@ -1803,7 +1881,7 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) int __init gfs2_glock_init(void) { - int ret; + int i, ret; ret = rhashtable_init(&gl_hash_table, &ht_parms); if (ret < 0) @@ -1832,6 +1910,9 @@ int __init gfs2_glock_init(void) return ret; } + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) + init_waitqueue_head(glock_wait_table + i); + return 0; } @@ -1860,6 +1941,7 @@ static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; @@ -1892,6 +1974,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, } static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) + __releases(RCU) { struct gfs2_glock_iter *gi = seq->private; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9ad4a6ac6c84..5e12220cc0c2 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/parser.h> #include "incore.h" +#include "util.h" /* Options for hostdata parser */ @@ -181,7 +182,9 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); +extern void gfs2_glock_hold(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_glock_queue_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh); extern void gfs2_holder_reinit(unsigned int state, u16 flags, @@ -257,11 +260,44 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) return gh->gh_gl; } +/** + * glock_set_object - set the gl_object field of a glock + * @gl: the glock + * @object: the object + */ static inline void glock_set_object(struct gfs2_glock *gl, void *object) { spin_lock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL)) + gfs2_dump_glock(NULL, gl); gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); } +/** + * glock_clear_object - clear the gl_object field of a glock + * @gl: the glock + * @object: the object + * + * I'd love to similarly add this: + * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object)) + * gfs2_dump_glock(NULL, gl); + * Unfortunately, that's not possible because as soon as gfs2_delete_inode + * frees the block in the rgrp, another process can reassign it for an I_NEW + * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget. + * That means gfs2_delete_inode may subsequently try to call this function + * for a glock that's already pointing to a brand new inode. If we clear the + * new inode's gl_object, we'll introduce metadata corruption. Function + * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also + * tries to clear gl_object, so it's more than just gfs2_delete_inode. + * + */ +static inline void glock_clear_object(struct gfs2_glock *gl, void *object) +{ + spin_lock(&gl->gl_lockref.lock); + if (gl->gl_object == object) + gl->gl_object = NULL; + spin_unlock(&gl->gl_lockref.lock); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5e69636d4dd3..dac6559e2195 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -329,32 +329,6 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl) return 1; } -/** - * gfs2_set_nlink - Set the inode's link count based on on-disk info - * @inode: The inode in question - * @nlink: The link count - * - * If the link count has hit zero, it must never be raised, whatever the - * on-disk inode might say. When new struct inodes are created the link - * count is set to 1, so that we can safely use this test even when reading - * in on disk information for the first time. - */ - -static void gfs2_set_nlink(struct inode *inode, u32 nlink) -{ - /* - * We will need to review setting the nlink count here in the - * light of the forthcoming ro bind mount work. This is a reminder - * to do that. - */ - if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { - if (nlink == 0) - clear_nlink(inode); - else - set_nlink(inode, nlink); - } -} - static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { const struct gfs2_dinode *str = buf; @@ -376,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); - gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); @@ -470,7 +444,7 @@ static int inode_go_lock(struct gfs2_holder *gh) (gh->gh_state == LM_ST_EXCLUSIVE)) { spin_lock(&sdp->sd_trunc_lock); if (list_empty(&ip->i_trunc_list)) - list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + list_add(&ip->i_trunc_list, &sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); wake_up(&sdp->sd_quota_wait); return 1; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 73fce76e67ee..6e18e9793ec4 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -606,6 +606,7 @@ enum { SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, + SDF_FORCE_AIL_FLUSH = 9, }; enum gfs2_freeze_state { @@ -816,6 +817,7 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; + int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; @@ -831,7 +833,7 @@ struct gfs2_sbd { atomic_t sd_freeze_state; struct mutex sd_freeze_mutex; - char sd_fsname[GFS2_FSNAME_LEN]; + char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2]; char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index acca501f8110..863749e29bf9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -109,7 +109,7 @@ static void gfs2_set_iop(struct inode *inode) * @no_addr: The inode number * @no_formal_ino: The inode generation number * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; - * GFS2_BLKST_FREE do indicate not to verify) + * GFS2_BLKST_FREE to indicate not to verify) * * If @type is DT_UNKNOWN, the inode type is fetched from disk. * @@ -145,7 +145,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; flush_delayed_work(&ip->i_gl->gl_work); - glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) @@ -170,11 +169,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } } + glock_set_object(ip->i_gl, ip); set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_put; - flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -202,14 +201,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); + glock_clear_object(ip->i_gl, ip); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - glock_set_object(ip->i_gl, NULL); fail: iget_failed(inode); return ERR_PTR(error); @@ -706,8 +705,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) goto fail_free_inode; - + flush_delayed_work(&ip->i_gl->gl_work); glock_set_object(ip->i_gl, ip); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; @@ -775,14 +775,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, return error; fail_gunlock3: + glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); gfs2_glock_put(io_gl); fail_gunlock2: if (io_gl) clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); fail_free_inode: - if (ip->i_gl) + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_put(ip->i_gl); + } gfs2_rsqa_delete(ip, NULL); fail_free_acls: if (default_acl) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0515f0a68637..65f33a0ac190 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -23,8 +23,6 @@ #include "sys.h" #include "trace_gfs2.h" -extern struct workqueue_struct *gfs2_control_wq; - /** * gfs2_update_stats - Update time based stats * @mv: Pointer to mean/variance structure to update @@ -1059,6 +1057,7 @@ static void free_recover_size(struct lm_lockstruct *ls) ls->ls_recover_submit = NULL; ls->ls_recover_result = NULL; ls->ls_recover_size = 0; + ls->ls_lvb_bits = NULL; } /* dlm calls before it does lock recovery */ @@ -1175,7 +1174,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, spin_unlock(&ls->ls_recover_spin); } -const struct dlm_lockspace_ops gdlm_lockspace_ops = { +static const struct dlm_lockspace_ops gdlm_lockspace_ops = { .recover_prep = gdlm_recover_prep, .recover_slot = gdlm_recover_slot, .recover_done = gdlm_recover_done, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a624f694400..f72c44231406 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -898,6 +898,10 @@ static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) { unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + + if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags)) + return 1; + return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= atomic_read(&sdp->sd_log_thresh2); } @@ -919,6 +923,15 @@ int gfs2_logd(void *data) while (!kthread_should_stop()) { + /* Check for errors writing to the journal */ + if (sdp->sd_log_error) { + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: error %d: " + "withdrawing the file system to " + "prevent further damage.\n", + sdp->sd_fsname, sdp->sd_log_error); + } + did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3010f9edd177..7dabbe721dba 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -207,8 +207,11 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_status) - fs_err(sdp, "Error %d writing to log\n", bio->bi_status); + if (bio->bi_status) { + fs_err(sdp, "Error %d writing to journal, jid=%u\n", + bio->bi_status, sdp->sd_jdesc->jd_jid); + wake_up(&sdp->sd_logd_waitq); + } bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fabe1614f879..61ef6c9be816 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -419,8 +419,9 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; + } else { + *bhp = bh; } - *bhp = bh; return ret; } @@ -452,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; @@ -461,7 +462,9 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e76058d34b74..c0a4b3778f3f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1113,7 +1113,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent return error; } - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); /* @@ -1159,10 +1159,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent } if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s", sdp->sd_table_name); else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u", sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); error = init_inodes(sdp, DO); @@ -1388,7 +1388,6 @@ static void gfs2_kill_sb(struct super_block *sb) sdp->sd_root_dir = NULL; sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); - gfs2_delete_debugfs_file(sdp); free_percpu(sdp->sd_lkstats); kill_block_super(sb); } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c2ca9566b764..e647938432bd 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -1474,8 +1474,11 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); + sdp->sd_log_error = error; + wake_up(&sdp->sd_logd_waitq); + } } static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 836e38ba5d0a..95b2a57ded33 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -705,8 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - glock_set_object(gl, NULL); - gfs2_glock_add_to_lru(gl); + glock_clear_object(gl, rgd); gfs2_glock_put(gl); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index fdedec379b78..769841185ce5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -924,6 +924,7 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); + gfs2_delete_debugfs_file(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); @@ -943,9 +944,9 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp) + if (wait) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); - return 0; + return sdp->sd_log_error; } void gfs2_freeze_func(struct work_struct *work) @@ -1295,7 +1296,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop * - * If we've received a callback on an iopen lock then its because a + * If we've received a callback on an iopen lock then it's because a * remote node tried to deallocate the inode but failed due to this node * still having the inode open. Here we mark the link count zero * since we know that it must have reached zero if the GLF_DEMOTE flag @@ -1317,6 +1318,23 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); } + + /* + * When under memory pressure when an inode's link count has dropped to + * zero, defer deleting the inode to the delete workqueue. This avoids + * calling into DLM under memory pressure, which can deadlock. + */ + if (!inode->i_nlink && + unlikely(current->flags & PF_MEMALLOC) && + gfs2_holder_initialized(&ip->i_iopen_gh)) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_queue_put(gl); + return false; + } + return generic_drop_inode(inode); } @@ -1501,6 +1519,22 @@ out_qs: } /** + * gfs2_glock_put_eventually + * @gl: The glock to put + * + * When under memory pressure, trigger a deferred glock put to make sure we + * won't call into DLM and deadlock. Otherwise, put the glock directly. + */ + +static void gfs2_glock_put_eventually(struct gfs2_glock *gl) +{ + if (current->flags & PF_MEMALLOC) + gfs2_glock_queue_put(gl); + else + gfs2_glock_put(gl); +} + +/** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict * @@ -1544,9 +1578,14 @@ static void gfs2_evict_inode(struct inode *inode) goto alloc_failed; } + /* Deletes should never happen under memory pressure anymore. */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + goto out; + /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; @@ -1562,6 +1601,12 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } + /* + * The inode may have been recreated in the meantime. + */ + if (inode->i_nlink) + goto out_truncate; + alloc_failed: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { @@ -1595,6 +1640,11 @@ alloc_failed: goto out_unlock; } + /* We're about to clear the bitmap for the dinode, but as soon as we + do, gfs2_create_inode can create another inode at the same block + location and try to set gl_object again. We clear gl_object here so + that subsequent inode creates don't see an old gl_object. */ + glock_clear_object(ip->i_gl, ip); error = gfs2_dinode_dealloc(ip); goto out_unlock; @@ -1623,14 +1673,17 @@ out_unlock: gfs2_rs_deltree(&ip->i_res); if (gfs2_holder_initialized(&ip->i_iopen_gh)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); } - if (gfs2_holder_initialized(&gh)) + if (gfs2_holder_initialized(&gh)) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); + } if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: @@ -1640,15 +1693,19 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - glock_set_object(ip->i_gl, NULL); + glock_clear_object(ip->i_gl, ip); wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); - gfs2_glock_put(ip->i_gl); + gfs2_glock_put_eventually(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + glock_clear_object(gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_hold(gl); gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_put_eventually(gl); } } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index c81295f407f6..3926f95a6eb7 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -151,6 +151,7 @@ extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; extern struct kmem_cache *gfs2_qadata_cachep; extern mempool_t *gfs2_page_pool; +extern struct workqueue_struct *gfs2_control_wq; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 54179554c7d2..ea09e41dbb49 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -25,6 +25,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "super.h" #include "trans.h" #include "util.h" @@ -1209,8 +1210,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, if (namel > GFS2_EA_MAX_NAME_LEN) return -ERANGE; - if (value == NULL) - return gfs2_xattr_remove(ip, type, name); + if (value == NULL) { + error = gfs2_xattr_remove(ip, type, name); + if (error == -ENODATA && !(flags & XATTR_REPLACE)) + error = 0; + return error; + } if (ea_check_size(sdp, namel, size)) return -ERANGE; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 78b41e1d5c67..60726ae7cf26 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -619,16 +619,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) goto out_no_root; - /* logical blocks are represented by 40 bits in pxd_t, etc. */ - sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; -#if BITS_PER_LONG == 32 - /* - * Page cache is indexed by long. - * I would use MAX_LFS_FILESIZE, but it's only half as big + /* logical blocks are represented by 40 bits in pxd_t, etc. + * and page cache is indexed by long */ - sb->s_maxbytes = min(((u64) PAGE_SIZE << 32) - 1, - (u64)sb->s_maxbytes); -#endif + sb->s_maxbytes = min(((loff_t)sb->s_blocksize) << 40, MAX_LFS_FILESIZE); sb->s_time_gran = 1; return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ac2dfe0c5a9c..e6c8954a4e89 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -997,7 +997,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { - lockdep_init_map(&kn->dep_map, "s_active", key, 0); + lockdep_init_map(&kn->dep_map, "kn->count", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 20fbcab97753..5f940d2a136b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -144,7 +144,7 @@ static void next_decode_page(struct nfsd4_compoundargs *argp) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + (argp->pagelen>>2); + argp->end = argp->p + XDR_QUADLEN(argp->pagelen); argp->pagelen = 0; } else { argp->end = argp->p + (PAGE_SIZE>>2); @@ -1279,9 +1279,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) argp->pagelen -= pages * PAGE_SIZE; len -= pages * PAGE_SIZE; - argp->p = (__be32 *)page_address(argp->pagelist[0]); - argp->pagelist++; - argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); + next_decode_page(argp); } argp->p += XDR_QUADLEN(len); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 38d0383dc7f9..bc69d40c4e8b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -969,7 +969,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; - int flags = 0; + rwf_t flags = 0; if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 3d424a51cabb..f0fd3adb1693 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -446,14 +446,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); - smp_mb__before_spinlock(); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { inode_unlock(inode); return PTR_ERR(realfile); } - od->upperfile = realfile; + smp_store_release(&od->upperfile, realfile); } else { /* somebody has beaten us to it */ if (!IS_ERR(realfile)) diff --git a/fs/proc/base.c b/fs/proc/base.c index 719c2e943ea1..98fd8f6df851 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; + struct pid_namespace *ns = inode->i_sb->s_fs_info; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; - proc_sched_show_task(p, m); + proc_sched_show_task(p, ns, m); put_task_struct(p); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 50493edc30e5..e5709343feb7 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -7,14 +7,14 @@ static int devinfo_show(struct seq_file *f, void *v) { int i = *(loff_t *) v; - if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i < CHRDEV_MAJOR_MAX) { if (i == 0) seq_puts(f, "Character devices:\n"); chrdev_show(f, i); } #ifdef CONFIG_BLOCK else { - i -= CHRDEV_MAJOR_HASH_SIZE; + i -= CHRDEV_MAJOR_MAX; if (i == 0) seq_puts(f, "\nBlock devices:\n"); blkdev_show(f, i); @@ -25,7 +25,7 @@ static int devinfo_show(struct seq_file *f, void *v) static void *devinfo_start(struct seq_file *f, loff_t *pos) { - if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return pos; return NULL; } @@ -33,7 +33,7 @@ static void *devinfo_start(struct seq_file *f, loff_t *pos) static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return NULL; return pos; } diff --git a/fs/read_write.c b/fs/read_write.c index 0cc7033aa413..61b58c7b6531 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -33,7 +33,7 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -static inline int unsigned_offsets(struct file *file) +static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } @@ -633,7 +633,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) EXPORT_SYMBOL(iov_shorten); static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, int type, int flags) + loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; @@ -655,7 +655,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, int type, int flags) + loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; @@ -871,7 +871,7 @@ out: #endif static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, - loff_t *pos, int flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -899,7 +899,7 @@ out: } ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags) + rwf_t flags) { if (!file->f_op->read_iter) return -EINVAL; @@ -908,7 +908,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_read); static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, int flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -935,7 +935,7 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, } ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags) + rwf_t flags) { if (!file->f_op->write_iter) return -EINVAL; @@ -944,7 +944,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_write); ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -962,7 +962,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_readv); ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -981,7 +981,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_writev); static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, int flags) + unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -1001,7 +1001,7 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, int flags) + unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -1027,7 +1027,7 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; @@ -1050,7 +1050,7 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; @@ -1094,7 +1094,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, - int, flags) + rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); @@ -1114,7 +1114,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, - int, flags) + rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); @@ -1127,7 +1127,7 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, #ifdef CONFIG_COMPAT static size_t compat_readv(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1147,7 +1147,7 @@ static size_t compat_readv(struct file *file, static size_t do_compat_readv(compat_ulong_t fd, const struct compat_iovec __user *vec, - compat_ulong_t vlen, int flags) + compat_ulong_t vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1173,7 +1173,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, static long do_compat_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret; @@ -1211,7 +1211,7 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos, int, flags) + unsigned long, vlen, loff_t, pos, rwf_t, flags) { return do_compat_preadv64(fd, vec, vlen, pos, flags); } @@ -1220,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, - int, flags) + rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; @@ -1232,7 +1232,7 @@ COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1254,7 +1254,7 @@ static size_t compat_writev(struct file *file, static size_t do_compat_writev(compat_ulong_t fd, const struct compat_iovec __user* vec, - compat_ulong_t vlen, int flags) + compat_ulong_t vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1279,7 +1279,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, static long do_compat_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret; @@ -1317,7 +1317,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos, int, flags) + unsigned long, vlen, loff_t, pos, rwf_t, flags) { return do_compat_pwritev64(fd, vec, vlen, pos, flags); } @@ -1325,7 +1325,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; diff --git a/fs/select.c b/fs/select.c index 9d5f15ed87fe..c6362e38ae92 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1164,11 +1164,7 @@ int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, ALIGN makes sure that - * actually happens. - */ - memset(fdset, 0, ALIGN(nr, BITS_PER_LONG)); + zero_fd_set(nr, fdset); return 0; } } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index b0d5897bc4e6..886085b47c75 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, goto out; WRITE_ONCE(uwq->waken, true); /* - * The implicit smp_mb__before_spinlock in try_to_wake_up() - * renders uwq->waken visible to other CPUs before the task is - * waken. + * The Program-Order guarantees provided by the scheduler + * ensure uwq->waken is visible before the task is woken. */ ret = wake_up_state(wq->private, mode); - if (ret) + if (ret) { /* * Wake only once, autoremove behavior. * - * After the effect of list_del_init is visible to the - * other CPUs, the waitqueue may disappear from under - * us, see the !list_empty_careful() in - * handle_userfault(). try_to_wake_up() has an - * implicit smp_mb__before_spinlock, and the - * wq->private is read before calling the extern - * function "wake_up_state" (which in turns calls - * try_to_wake_up). While the spin_lock;spin_unlock; - * wouldn't be enough, the smp_mb__before_spinlock is - * enough to avoid an explicit smp_mb() here. + * After the effect of list_del_init is visible to the other + * CPUs, the waitqueue may disappear from under us, see the + * !list_empty_careful() in handle_userfault(). + * + * try_to_wake_up() has an implicit smp_mb(), and the + * wq->private is read before calling the extern function + * "wake_up_state" (which in turns calls try_to_wake_up). */ list_del_init(&wq->entry); + } out: return ret; } |