diff options
Diffstat (limited to 'fs')
487 files changed, 19919 insertions, 36724 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 8aa56bb6e861..6caca025019d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -52,7 +52,7 @@ enum { /* Options that take integer arguments */ Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, /* String options */ - Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag, + Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag, /* Options that take no arguments */ Opt_nodevmap, /* Cache options */ diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 1ef16bd8280b..3abc447783aa 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -381,7 +381,7 @@ static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct p9_fid *fid = iocb->ki_filp->private_data; - int ret, err; + int ret, err = 0; p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", iov_iter_count(to), iocb->ki_pos); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 510040b04c96..b1dc51888048 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -540,8 +540,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, unlock_new_inode(inode); return inode; error: - unlock_new_inode(inode); - iput(inode); + iget_failed(inode); return ERR_PTR(retval); } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 09e4433717b8..e8aa57dc8d6d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -149,8 +149,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, unlock_new_inode(inode); return inode; error: - unlock_new_inode(inode); - iput(inode); + iget_failed(inode); return ERR_PTR(retval); } diff --git a/fs/Kconfig b/fs/Kconfig index 011f43365d7b..da3f32f1a4e4 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -11,18 +11,15 @@ config DCACHE_WORD_ACCESS if BLOCK source "fs/ext2/Kconfig" -source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" -source "fs/jbd/Kconfig" source "fs/jbd2/Kconfig" config FS_MBCACHE # Meta block cache for Extended Attributes (ext2/ext3/ext4) tristate default y if EXT2_FS=y && EXT2_FS_XATTR - default y if EXT3_FS=y && EXT3_FS_XATTR default y if EXT4_FS=y - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS + default m if EXT2_FS_XATTR || EXT4_FS source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index cb20e4bf2303..f79cf4043e60 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o @@ -62,12 +63,10 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ -obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 obj-$(CONFIG_EXT2_FS) += ext2/ # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 # unless explicitly requested by rootfstype obj-$(CONFIG_EXT4_FS) += ext4/ -obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-$(CONFIG_SQUASHFS) += squashfs/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index a19c31d3f369..4d4a0df8344f 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index cffe8370fb44..c69a87eaf57d 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -64,7 +64,7 @@ struct affs_inode_info { /* short cut to get to the affs specific inode data */ static inline struct affs_inode_info *AFFS_I(struct inode *inode) { - return list_entry(inode, struct affs_inode_info, vfs_inode); + return container_of(inode, struct affs_inode_info, vfs_inode); } /* diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index a8f463c028ce..5fa92bc790ef 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -140,7 +140,7 @@ affs_remove_link(struct dentry *dentry) { struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; - struct buffer_head *bh = NULL, *link_bh = NULL; + struct buffer_head *bh, *link_bh = NULL; u32 link_ino, ino; int retval; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a022f4accd76..17349500592d 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -346,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 { struct super_block *sb = dir->i_sb; struct buffer_head *inode_bh = NULL; - struct buffer_head *bh = NULL; + struct buffer_head *bh; u32 block = 0; int retval; diff --git a/fs/affs/super.c b/fs/affs/super.c index 3f89c9e05b40..5b50c4ca43a7 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/writeback.h> +#include <linux/blkdev.h> #include "affs.h" static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) * blocks, we will have to change it. */ - size = sb->s_bdev->bd_inode->i_size >> 9; + size = i_size_read(sb->s_bdev->bd_inode) >> 9; pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); affs_set_blocksize(sb, PAGE_SIZE); /* Try to find root block. Its location depends on the block size. */ - i = 512; - j = 4096; + i = bdev_logical_block_size(sb->s_bdev); + j = PAGE_SIZE; if (blocksize > 0) { i = j = blocksize; size = size / (blocksize / 512); } + for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) { sbi->s_root_block = root_block; if (root_block < 0) diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index f39b71c3981e..ea5b69a18ba9 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -16,14 +16,12 @@ static int affs_symlink_readpage(struct file *file, struct page *page) struct inode *inode = page->mapping->host; char *link = kmap(page); struct slink_front *lf; - int err; int i, j; char c; char lc; pr_debug("follow_link(ino=%lu)\n", inode->i_ino); - err = -EIO; bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) goto fail; @@ -66,7 +64,7 @@ fail: SetPageError(page); kunmap(page); unlock_page(page); - return err; + return -EIO; } const struct address_space_operations affs_symlink_aops = { @@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_flags |= VM_DONTEXPAND; - vma->vm_ops = &generic_file_vm_ops; - return 0; -} - -static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) +static int aio_ring_mremap(struct vm_area_struct *vma) { + struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; @@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) return res; } +static const struct vm_operations_struct aio_ring_vm_ops = { + .mremap = aio_ring_mremap, +#if IS_ENABLED(CONFIG_MMU) + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, +#endif +}; + +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_flags |= VM_DONTEXPAND; + vma->vm_ops = &aio_ring_vm_ops; + return 0; +} + static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, - .mremap = aio_ring_remap, }; #if IS_ENABLED(CONFIG_MIGRATION) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 5b700ef1e59d..c37149b929be 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -238,11 +238,6 @@ static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) return d_inode(sbi->sb->s_root)->i_ino; } -static inline int simple_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - static inline void __autofs4_add_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); diff --git a/fs/befs/befs.h b/fs/befs/befs.h index 1fead8d56a98..35d19e8731e3 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -112,7 +112,7 @@ BEFS_SB(const struct super_block *super) static inline struct befs_inode_info * BEFS_I(const struct inode *inode) { - return list_entry(inode, struct befs_inode_info, vfs_inode); + return container_of(inode, struct befs_inode_info, vfs_inode); } static inline befs_blocknr_t diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cd46e4158830..6b659967898e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1530,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note) file = vma->vm_file; if (!file) continue; - filename = d_path(&file->f_path, name_curpos, remaining); + filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { vfree(data); @@ -1540,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note) continue; } - /* d_path() fills at the end, move name down */ + /* file_path() fills at the end, move name down */ /* n = strlen(filename) + 1: */ n = (name_curpos + remaining) - filename; remaining = filename - name_curpos; diff --git a/fs/block_dev.c b/fs/block_dev.c index f04c873a7365..22ea424ee741 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -28,6 +28,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/dax.h> #include <asm/uaccess.h> #include "internal.h" @@ -43,7 +44,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode) return container_of(inode, struct bdev_inode, vfs_inode); } -inline struct block_device *I_BDEV(struct inode *inode) +struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } @@ -152,6 +153,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, + NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -377,7 +381,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; - if (!ops->rw_page) + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); } @@ -408,7 +412,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, int result; int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; - if (!ops->rw_page) + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); @@ -438,11 +442,17 @@ EXPORT_SYMBOL_GPL(bdev_write_page); * accessible at this address. */ long bdev_direct_access(struct block_device *bdev, sector_t sector, - void **addr, unsigned long *pfn, long size) + void __pmem **addr, unsigned long *pfn, long size) { long avail; const struct block_device_operations *ops = bdev->bd_disk->fops; + /* + * The device driver is allowed to sleep, in order to make the + * memory directly accessible. + */ + might_sleep(); + if (size < 0) return size; if (!ops->direct_access) @@ -453,7 +463,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, addr, pfn, size); + avail = ops->direct_access(bdev, sector, addr, pfn); if (!avail) return -ERANGE; return min(avail, size); @@ -1170,6 +1180,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); @@ -1759,7 +1770,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; @@ -1771,13 +1782,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -1785,8 +1796,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) func(I_BDEV(inode), arg); - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index df9932b00d08..1ce06c849a86 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper); BTRFS_WORK_HELPER(scrub_helper); BTRFS_WORK_HELPER(scrubwrc_helper); BTRFS_WORK_HELPER(scrubnc_helper); +BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ec2ee477f8ba..b0b093b6afec 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper); BTRFS_WORK_HELPER_PROTO(scrub_helper); BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); BTRFS_WORK_HELPER_PROTO(scrubnc_helper); +BTRFS_WORK_HELPER_PROTO(scrubparity_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, unsigned int flags, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 614aaa1969bd..ecbc63d3143e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -206,10 +206,33 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, return -ENOMEM; ref->root_id = root_id; - if (key) + if (key) { ref->key_for_search = *key; - else + /* + * We can often find data backrefs with an offset that is too + * large (>= LLONG_MAX, maximum allowed file offset) due to + * underflows when subtracting a file's offset with the data + * offset of its corresponding extent data item. This can + * happen for example in the clone ioctl. + * So if we detect such case we set the search key's offset to + * zero to make sure we will find the matching file extent item + * at add_all_parents(), otherwise we will miss it because the + * offset taken form the backref is much larger then the offset + * of the file extent item. This can make us scan a very large + * number of file extent items, but at least it will not make + * us miss any. + * This is an ugly workaround for a behaviour that should have + * never existed, but it does and a fix for the clone ioctl + * would touch a lot of places, cause backwards incompatibility + * and would not fix the problem for extents cloned with older + * kernels. + */ + if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY && + ref->key_for_search.offset >= LLONG_MAX) + ref->key_for_search.offset = 0; + } else { memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + } ref->inode_list = NULL; ref->level = level; @@ -250,8 +273,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * the first item to check. But sometimes, we may enter it with * slot==nritems. In that case, go to the next leaf before we continue. */ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) - ret = btrfs_next_old_leaf(root, path, time_seq); + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + if (time_seq == (u64)-1) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + } while (!ret && count < total_refs) { eb = path->nodes[0]; @@ -291,7 +318,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - ret = btrfs_next_old_item(root, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_next_item(root, path); + else + ret = btrfs_next_old_item(root, path, time_seq); } if (ret > 0) @@ -334,6 +364,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); + else if (time_seq == (u64)-1) + root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -343,7 +375,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, + 0, 0); + else + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, + time_seq); /* root node has been locked, we can release @subvol_srcu safely here */ srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -491,7 +528,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -507,7 +546,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, } /* - * merge two lists of backrefs and adjust counts accordingly + * merge backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. @@ -535,9 +574,9 @@ static void __merge_refs(struct list_head *head, int mode) ref2 = list_entry(pos2, struct __prelim_ref, list); + if (!ref_for_same_block(ref1, ref2)) + continue; if (mode == 1) { - if (!ref_for_same_block(ref1, ref2)) - continue; if (!ref1->parent && ref2->parent) { xchg = ref1; ref1 = ref2; @@ -572,8 +611,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct list_head *prefs, u64 *total_refs, u64 inum) { + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; struct btrfs_key key; struct btrfs_key op_key = {0}; int sgn; @@ -583,12 +622,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, btrfs_disk_key_to_cpu(&op_key, &extent_op->key); spin_lock(&head->lock); - n = rb_first(&head->ref_root); - while (n) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - n = rb_next(n); + list_for_each_entry(node, &head->ref_list, list) { if (node->seq > seq) continue; @@ -621,7 +655,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, NULL, + ret = __add_prelim_ref(prefs, 0, NULL, ref->level + 1, ref->parent, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); @@ -653,11 +687,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_data_ref *ref; ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ret = __add_prelim_ref(prefs, 0, NULL, 0, ref->parent, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; @@ -882,6 +912,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * + * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * much like trans == NULL case, the difference only lies in it will not + * commit root. + * The special case is for qgroup to search roots in commit_transaction(). + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, @@ -920,6 +955,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } + if (time_seq == (u64)-1) + path->skip_locking = 1; + /* * grab both a lock on the path and a lock on the delayed ref head. * We need both to get a consistent picture of how the refs look @@ -934,9 +972,10 @@ again: BUG_ON(ret == 0); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (trans && likely(trans->type != __TRANS_DUMMY)) { + if (trans && likely(trans->type != __TRANS_DUMMY) && + time_seq != (u64)-1) { #else - if (trans) { + if (trans && time_seq != (u64)-1) { #endif /* * look if there are updates for this ref queued and lock the @@ -1034,7 +1073,10 @@ again: eb = read_tree_block(fs_info->extent_root, ref->parent, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0ef5cc13fae2..81220b2203c6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,8 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +/* DIO is ready to submit */ +#define BTRFS_INODE_DIO_READY 12 /* * The following 3 bits are meant only for the btree inode. * When any of them is set, it means an error happened while writing an diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ce7dec88f4b8..541fbfaed276 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -343,7 +343,7 @@ static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bio_end_io(struct bio *bp); static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, const struct btrfsic_block *block, @@ -2207,7 +2207,7 @@ continue_loop: goto again; } -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +static void btrfsic_bio_end_io(struct bio *bp) { struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; int iodone_w_error; @@ -2215,7 +2215,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bio_error_status) + if (bp->bi_error) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2230,7 +2230,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) printk(KERN_INFO "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bio_error_status, + bp->bi_error, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2252,7 +2252,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) block = next_block; } while (NULL != block); - bp->bi_end_io(bp, bio_error_status); + bp->bi_end_io(bp); } static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ce62324c78e7..57ee8ca29b06 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -97,10 +97,7 @@ static inline int compressed_bio_size(struct btrfs_root *root, static struct bio *compressed_bio_alloc(struct block_device *bdev, u64 first_byte, gfp_t gfp_flags) { - int nr_vecs; - - nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); + return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags); } static int check_compressed_csum(struct inode *inode, @@ -152,7 +149,7 @@ fail: * The compressed pages are freed here, and it must be run * in process context */ -static void end_compressed_bio_read(struct bio *bio, int err) +static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; struct inode *inode; @@ -160,7 +157,7 @@ static void end_compressed_bio_read(struct bio *bio, int err) unsigned long index; int ret; - if (err) + if (bio->bi_error) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -210,7 +207,7 @@ csum_failed: bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); - bio_endio(cb->orig_bio, 0); + bio_endio(cb->orig_bio); } /* finally free the cb struct */ @@ -266,7 +263,7 @@ static noinline void end_compressed_writeback(struct inode *inode, * This also calls the writeback end hooks for the file pages so that * metadata and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct bio *bio, int err) +static void end_compressed_bio_write(struct bio *bio) { struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; @@ -274,7 +271,7 @@ static void end_compressed_bio_write(struct bio *bio, int err) struct page *page; unsigned long index; - if (err) + if (bio->bi_error) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -293,7 +290,7 @@ static void end_compressed_bio_write(struct bio *bio, int err) cb->start, cb->start + cb->len - 1, NULL, - err ? 0 : 1); + bio->bi_error ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -697,8 +694,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - if (ret) - bio_endio(comp_bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(comp_bio); + } bio_put(comp_bio); @@ -724,8 +723,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - if (ret) - bio_endio(comp_bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(comp_bio); + } bio_put(comp_bio); return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0f11ebc92f02..5f745eadf77d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1159,8 +1159,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); return ret; + } } if (buf == root->node) { @@ -1439,8 +1441,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); old = read_tree_block(root, logical, 0); - if (WARN_ON(!old || !extent_buffer_uptodate(old))) { - free_extent_buffer(old); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); btrfs_warn(root->fs_info, "failed to read tree block %llu from get_old_root", logical); } else { @@ -1685,7 +1688,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur || !uptodate) { if (!cur) { cur = read_tree_block(root, blocknr, gen); - if (!cur || !extent_buffer_uptodate(cur)) { + if (IS_ERR(cur)) { + return PTR_ERR(cur); + } else if (!extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; } @@ -1864,8 +1869,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (eb && !extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { + if (!IS_ERR(eb)) + free_extent_buffer(eb); eb = NULL; } @@ -2494,7 +2500,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, ret = -EAGAIN; tmp = read_tree_block(root, blocknr, 0); - if (tmp) { + if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f364e1d8d3d..938efe33be80 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,7 +174,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4, 0 }; +static int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1300,7 +1300,7 @@ struct btrfs_block_group_cache { /* for raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; - unsigned int ro:1; + unsigned int ro; unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; @@ -1518,12 +1518,6 @@ struct btrfs_fs_info { */ struct mutex ordered_operations_mutex; - /* - * Same as ordered_operations_mutex except this is for ordered extents - * and not the operations. - */ - struct mutex ordered_extent_flush_mutex; - struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; @@ -1619,10 +1613,7 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - struct kobject super_kobj; struct kobject *space_info_kobj; - struct kobject *device_dir_kobj; - struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; @@ -1698,6 +1689,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_nocow_workers; + struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1735,7 +1727,7 @@ struct btrfs_fs_info { /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; - /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + /* used by qgroup for an efficient tree traversal */ u64 qgroup_seq; /* qgroup rescan items */ @@ -1780,6 +1772,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; + struct mutex delete_unused_bgs_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; @@ -3438,6 +3431,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); +void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); @@ -3458,6 +3453,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); @@ -3495,9 +3491,9 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_set_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); -void btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_dec_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); @@ -3515,6 +3511,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -4050,6 +4049,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #ifdef CONFIG_BTRFS_ASSERT +__cold static inline void assfail(char *expr, char *file, int line) { pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", @@ -4065,10 +4065,13 @@ static inline void assfail(char *expr, char *file, int line) #define btrfs_assert() __printf(5, 6) +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +const char *btrfs_decode_error(int errno); +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); @@ -4111,11 +4114,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ - #define btrfs_abort_transaction(trans, root, errno) \ do { \ - __btrfs_abort_transaction(trans, root, __func__, \ - __LINE__, errno); \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ } while (0) #define btrfs_std_error(fs_info, errno) \ @@ -4132,6 +4141,7 @@ do { \ } while (0) __printf(5, 6) +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); @@ -4172,8 +4182,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 8f8ed7d20bac..ac3e81da6d4e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -22,6 +22,7 @@ #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" +#include "qgroup.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } -/* - * entries in the rb tree are ordered by the byte number of the extent, - * type of the delayed backrefs and content of delayed backrefs. - */ -static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1, - bool compare_seq) -{ - if (ref1->bytenr < ref2->bytenr) - return -1; - if (ref1->bytenr > ref2->bytenr) - return 1; - if (ref1->is_head && ref2->is_head) - return 0; - if (ref2->is_head) - return -1; - if (ref1->is_head) - return 1; - if (ref1->type < ref2->type) - return -1; - if (ref1->type > ref2->type) - return 1; - if (ref1->no_quota > ref2->no_quota) - return 1; - if (ref1->no_quota < ref2->no_quota) - return -1; - /* merging of sequenced refs is not allowed */ - if (compare_seq) { - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; - } - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1), - ref1->type); - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), - btrfs_delayed_node_to_data_ref(ref1)); - } - BUG(); - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. - */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - struct btrfs_delayed_ref_node *ins; - int cmp; - - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, ins, 1); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, rb_erase(&head->href_node, &delayed_refs->href_root); } else { assert_spin_locked(&head->lock); - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); } ref->in_tree = 0; btrfs_put_delayed_ref(ref); @@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } -static int merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref, u64 seq) -{ - struct rb_node *node; - int mod = 0; - int done = 0; - - node = rb_next(&ref->rb_node); - while (!done && node) { - struct btrfs_delayed_ref_node *next; - - next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - if (seq && next->seq >= seq) - break; - if (comp_entry(ref, next, 0)) - continue; - - if (ref->action == next->action) { - mod = next->ref_mod; - } else { - if (ref->ref_mod < next->ref_mod) { - struct btrfs_delayed_ref_node *tmp; - - tmp = ref; - ref = next; - next = tmp; - done = 1; - } - mod = -next->ref_mod; - } - - drop_delayed_ref(trans, delayed_refs, head, next); - ref->ref_mod += mod; - if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); - done = 1; - } else { - /* - * You can't have multiples of the same ref on a tree - * block. - */ - WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY); - } - } - return done; -} - -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - u64 seq = 0; - - assert_spin_locked(&head->lock); - /* - * We don't have too much refs to merge in the case of delayed data - * refs. - */ - if (head->is_data) - return; - - spin_lock(&fs_info->tree_mod_seq_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - spin_unlock(&fs_info->tree_mod_seq_lock); - - node = rb_first(&head->ref_root); - while (node) { - struct btrfs_delayed_ref_node *ref; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - /* We can't merge refs that are outside of our seq count */ - if (seq && ref->seq >= seq) - break; - if (merge_ref(trans, delayed_refs, head, ref, seq)) - node = rb_first(&head->ref_root); - else - node = rb_next(&ref->rb_node); - } -} - int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -443,45 +270,71 @@ again: } /* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent + * Helper to insert the ref_node to the tail or merge with tail. * - * This may free existing if the update cancels out whatever - * operation it was doing. + * Return 0 for insert. + * Return >0 for merge. */ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) +static int +add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { - if (update->action != existing->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. - */ - existing->ref_mod--; - if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, head, existing); - else - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + struct btrfs_delayed_ref_node *exist; + int mod; + int ret = 0; + + spin_lock(&href->lock); + /* Check whether we can merge the tail node with ref */ + if (list_empty(&href->ref_list)) + goto add_tail; + exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, + list); + /* No need to compare bytenr nor is_head */ + if (exist->type != ref->type || exist->no_quota != ref->no_quota || + exist->seq != ref->seq) + goto add_tail; + + if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || + exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), + btrfs_delayed_node_to_tree_ref(ref), + ref->type)) + goto add_tail; + if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || + exist->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(exist), + btrfs_delayed_node_to_data_ref(ref))) + goto add_tail; + + /* Now we are sure we can merge */ + ret = 1; + if (exist->action == ref->action) { + mod = ref->ref_mod; } else { - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. - * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; + /* Need to change action */ + if (exist->ref_mod < ref->ref_mod) { + exist->action = ref->action; + mod = -exist->ref_mod; + exist->ref_mod = ref->ref_mod; + } else + mod = -ref->ref_mod; } + exist->ref_mod += mod; + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) + drop_delayed_ref(trans, root, href, exist); + spin_unlock(&href->lock); + return ret; + +add_tail: + list_add_tail(&ref->list, &href->ref_list); + atomic_inc(&root->num_entries); + trans->delayed_ref_updates++; + spin_unlock(&href->lock); + return ret; } /* @@ -568,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, int action, int is_data) + struct btrfs_delayed_ref_node *ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -618,10 +473,22 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - head_ref->ref_root = RB_ROOT; + INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + /* Record qgroup extent info if provided */ + if (qrecord) { + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + + qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qrecord); + if (qexisting) + kfree(qrecord); + } + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -659,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -693,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + /* + * XXX: memory should be freed at the same level allocated. + * But bad practice is anywhere... Follow it now. Need cleanup. + */ + if (ret > 0) kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -721,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -758,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -790,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -800,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - return -ENOMEM; + if (!head_ref) + goto free_ref; + + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) + goto free_head_ref; } head_ref->extent_op = extent_op; @@ -814,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -823,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_refs->lock); return 0; + +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_ref: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + + return -ENOMEM; } /* @@ -839,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -854,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; } + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, + head_ref); + return -ENOMEM; + } + } + head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -863,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -891,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data); + add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 5eb0892396d0..13fb5e6090fe 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -24,9 +24,25 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +/* + * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the + * same ref_node structure. + * Ref_head is in a higher logic level than tree/data ref, and duplicated + * bytenr/num_bytes in ref_node is really a waste or memory, they should be + * referred from ref_head. + * This gets more disgusting after we use list to store tree/data ref in + * ref_head. Must clean this mess up later. + */ struct btrfs_delayed_ref_node { + /* + * ref_head use rb tree, stored in ref_root->href. + * indexed by bytenr + */ struct rb_node rb_node; + /*data/tree ref use list, stored in ref_head->ref_list. */ + struct list_head list; + /* the starting bytenr of the extent */ u64 bytenr; @@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_root; + struct list_head ref_list; struct rb_node href_node; @@ -132,6 +148,9 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root href_root; + /* dirty extent records */ + struct rb_root dirty_extent_root; + /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -156,6 +175,14 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * To make qgroup to skip given root. + * This is for snapshot, as btrfs_qgroup_inherit() will manully + * modify counters for snapshot and its source, so we should skip + * the snapshot in new_root/old_roots or it will get calculated twice + */ + u64 qgroup_to_skip; }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 0573848c7333..564a7de17d99 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; + ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : @@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_kobj_rm_device(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0bccf18dc1dc..9ebd34f1c677 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -703,7 +703,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) return -EIO; /* we fixed nothing */ } -static void end_workqueue_bio(struct bio *bio, int err) +static void end_workqueue_bio(struct bio *bio) { struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; @@ -711,7 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = err; + end_io_wq->error = bio->bi_error; if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -808,7 +808,8 @@ static void run_one_async_done(struct btrfs_work *work) /* If an error occured we just want to clean up the bio and move on */ if (async->error) { - bio_endio(async->bio, async->error); + async->bio->bi_error = async->error; + bio_endio(async->bio); return; } @@ -908,8 +909,10 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * submission context. Just jump into btrfs_map_bio */ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); - if (ret) - bio_endio(bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -960,10 +963,13 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } - if (ret) { + if (ret) + goto out_w_error; + return 0; + out_w_error: - bio_endio(bio, ret); - } + bio->bi_error = ret; + bio_endio(bio); return ret; } @@ -1149,12 +1155,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) - return NULL; + return ERR_PTR(-ENOMEM); ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { free_extent_buffer(buf); - return NULL; + return ERR_PTR(ret); } return buf; @@ -1509,20 +1515,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), generation); - if (!root->node) { - ret = -ENOMEM; + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto read_fail; + free_extent_buffer(root->node); + goto find_fail; } root->commit_root = btrfs_root_node(root); out: btrfs_free_path(path); return root; -read_fail: - free_extent_buffer(root->node); find_fail: kfree(root); alloc_fail: @@ -1725,6 +1730,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; + bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; return 0; } @@ -1736,22 +1742,22 @@ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct btrfs_end_io_wq *end_io_wq; - int error; end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - error = end_io_wq->error; + bio->bi_error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - bio_endio(bio, error); + bio_endio(bio); } static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; int again; + struct btrfs_trans_handle *trans; do { again = 0; @@ -1773,7 +1779,6 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); - btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -1782,6 +1787,16 @@ static int cleaner_kthread(void *arg) * needn't do anything special here. */ btrfs_run_defrag_inodes(root->fs_info); + + /* + * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * with relocation (btrfs_relocate_chunk) and relocation + * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) + * after acquiring fs_info->delete_unused_bgs_mutex. So we + * can't hold, nor need to, fs_info->cleaner_mutex when deleting + * unused block groups. + */ + btrfs_delete_unused_bgs(root->fs_info); sleep: if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); @@ -1790,6 +1805,34 @@ sleep: __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); + + /* + * Transaction kthread is stopped before us and wakes us up. + * However we might have started a new transaction and COWed some + * tree blocks when deleting unused block groups for example. So + * make sure we commit the transaction we started to have a clean + * shutdown when evicting the btree inode - if it has dirty pages + * when we do the final iput() on it, eviction will trigger a + * writeback for it which will fail with null pointer dereferences + * since work queues and other resources were already released and + * destroyed by the time the iput/eviction/writeback is made. + */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + btrfs_err(root->fs_info, + "cleaner transaction attach returned %ld", + PTR_ERR(trans)); + } else { + int ret; + + ret = btrfs_commit_transaction(trans, root); + if (ret) + btrfs_err(root->fs_info, + "cleaner open transaction commit returned %d", + ret); + } + return 0; } @@ -2320,8 +2363,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); - if (!log_tree_root->node || - !extent_buffer_uptodate(log_tree_root->node)) { + if (IS_ERR(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + ret = PTR_ERR(log_tree_root->node); + kfree(log_tree_root); + return ret; + } else if (!extent_buffer_uptodate(log_tree_root->node)) { printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2489,12 +2536,12 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); + mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); init_rwsem(&fs_info->delayed_iput_sem); - init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -2567,7 +2614,6 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); - mutex_init(&fs_info->ordered_extent_flush_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); @@ -2797,10 +2843,11 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), generation); - if (!chunk_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + if (IS_ERR(chunk_root->node) || + !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); + chunk_root->node = NULL; goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); @@ -2834,11 +2881,11 @@ retry_root_backup: tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), generation); - if (!tree_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (IS_ERR(tree_root->node) || + !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); - + tree_root->node = NULL; goto recovery_tree_root; } @@ -2874,10 +2921,22 @@ retry_root_backup: btrfs_close_extra_devices(fs_devices, 1); + ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + if (ret) { + pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_sysfs_add_device(fs_devices); + if (ret) { + pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + goto fail_fsdev_sysfs; + } + ret = btrfs_sysfs_add_one(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); - goto fail_block_groups; + goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); @@ -2896,8 +2955,9 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING "BTRFS: " - "too many missing devices, writeable mount is not allowed\n"); + pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", + fs_info->fs_devices->missing_devices, + fs_info->num_tolerated_disk_barrier_failures); goto fail_sysfs; } @@ -3055,6 +3115,9 @@ fail_cleaner: fail_sysfs: btrfs_sysfs_remove_one(fs_info); +fail_fsdev_sysfs: + btrfs_sysfs_remove_fsid(fs_info->fs_devices); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -3267,10 +3330,8 @@ static int write_dev_supers(struct btrfs_device *device, * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done */ -static void btrfs_end_empty_barrier(struct bio *bio, int err) +static void btrfs_end_empty_barrier(struct bio *bio) { - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3298,8 +3359,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (!bio_flagged(bio, BIO_UPTODATE)) { - ret = -EIO; + if (bio->bi_error) { + ret = bio->bi_error; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); } @@ -3703,6 +3764,15 @@ void close_ctree(struct btrfs_root *root) cancel_work_sync(&fs_info->async_reclaim_work); if (!(fs_info->sb->s_flags & MS_RDONLY)) { + /* + * If the cleaner thread is stopped and there are + * block groups queued for removal, the deletion will be + * skipped when we quit the cleaner thread. + */ + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_delete_unused_bgs(root->fs_info); + mutex_unlock(&root->fs_info->cleaner_mutex); + ret = btrfs_commit_super(root); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); @@ -3725,6 +3795,7 @@ void close_ctree(struct btrfs_root *root) } btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -4053,6 +4124,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *tmp; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, @@ -4068,11 +4140,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((node = rb_first(&head->ref_root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, + list) { ref->in_tree = 0; - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0ec3acd14cbf..5411f0ab5683 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op, - int no_quota); + struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -1317,8 +1316,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_root *root, - struct btrfs_path *path, +static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; @@ -1884,10 +1882,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, + u64 *discarded_bytes) { - return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); + int j, ret = 0; + u64 bytes_left, end; + u64 aligned_start = ALIGN(start, 1 << 9); + + if (WARN_ON(start != aligned_start)) { + len -= aligned_start - start; + len = round_down(len, 1 << 9); + start = aligned_start; + } + + *discarded_bytes = 0; + + if (!len) + return 0; + + end = start + len; + bytes_left = len; + + /* Skip any superblocks on this device. */ + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_start = btrfs_sb_offset(j); + u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; + u64 size = sb_start - start; + + if (!in_range(sb_start, start, bytes_left) && + !in_range(sb_end, start, bytes_left) && + !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) + continue; + + /* + * Superblock spans beginning of range. Adjust start and + * try again. + */ + if (sb_start <= start) { + start += sb_end - start; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + continue; + } + + if (size) { + ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += size; + else if (ret != -EOPNOTSUPP) + return ret; + } + + start = sb_end; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + } + + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += bytes_left; + } + return ret; } int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, @@ -1908,14 +1973,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, for (i = 0; i < bbio->num_stripes; i++, stripe++) { + u64 bytes; if (!stripe->dev->can_discard) continue; ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, - stripe->length); + stripe->length, + &bytes); if (!ret) - discarded_bytes += stripe->length; + discarded_bytes += bytes; else if (ret != -EOPNOTSUPP) break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ @@ -1967,10 +2034,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, - int no_quota, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1978,9 +2044,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; u64 refs; int ret; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; + int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) @@ -1996,26 +2064,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) + if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; - /* - * Ok we were able to insert an inline extent and it appears to be a new - * reference, deal with the qgroup accounting. - */ - if (!ret && !no_quota) { - ASSERT(root->fs_info->quota_enabled); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) - type = BTRFS_QGROUP_OPER_ADD_SHARED; - btrfs_release_path(path); - - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - goto out; - } /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -2026,8 +2076,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); - if (refs) - type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2035,13 +2083,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - if (ret) - goto out; - } - path->reada = 1; path->leave_spinning = 1; /* now insert the actual backref */ @@ -2087,17 +2128,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ref->objectid, ref->offset, &ins, node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - node->no_quota, extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_free_extent(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op, node->no_quota); + extent_op); } else { BUG(); } @@ -2255,15 +2294,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->level, &ins, node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, node->no_quota, + ret = __btrfs_inc_extent_ref(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op, - node->no_quota); + ret = __btrfs_free_extent(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else { BUG(); } @@ -2323,28 +2361,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline struct btrfs_delayed_ref_node * +static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct rb_node *node; - struct btrfs_delayed_ref_node *ref, *last = NULL;; + struct btrfs_delayed_ref_node *ref; + + if (list_empty(&head->ref_list)) + return NULL; /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. */ - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry(ref, &head->ref_list, list) { if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - else if (last == NULL) - last = ref; - node = rb_next(node); } - return last; + + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, + list); } /* @@ -2396,16 +2433,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } - /* - * We need to try and merge add/drops of the same ref since we - * can run into issues with relocate dropping the implicit ref - * and then it being added back again before the drop can - * finish. If we merged anything we need to re-loop so we can - * get a good ref. - */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, - locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2482,7 +2510,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root) || + if (!list_empty(&locked_ref->ref_list) || locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); @@ -2496,7 +2524,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } else { actual_count++; ref->in_tree = 0; - rb_erase(&ref->rb_node, &locked_ref->ref_root); + list_del(&ref->list); } atomic_dec(&delayed_refs->num_entries); @@ -2864,9 +2892,6 @@ again: goto again; } out: - ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); - if (ret) - return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2905,7 +2930,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2934,11 +2958,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - + list_for_each_entry(ref, &head->ref_list, list) { /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3693,7 +3713,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; - found->full = 0; + if (total_bytes > 0) + found->full = 0; spin_unlock(&found->lock); *space_info = found; return 0; @@ -3721,7 +3742,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - found->full = 0; + if (total_bytes > 0) + found->full = 0; + else + found->full = 1; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3975,6 +3999,9 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; + if (need_commit > 0) + btrfs_wait_ordered_roots(fs_info, -1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -4088,7 +4115,7 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } -static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) { u64 num_dev; @@ -4102,24 +4129,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) else num_dev = 1; /* DUP or single */ - /* metadata for updaing devices and chunk tree */ - return btrfs_calc_trans_metadata_size(root, num_dev + 1); + return num_dev; } -static void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) +/* + * If @is_allocation is true, reserve space in the system space info necessary + * for allocating a chunk, otherwise if it's false, reserve space necessary for + * removing a chunk. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 type) { struct btrfs_space_info *info; u64 left; u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly; + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use; spin_unlock(&info->lock); - thresh = get_system_chunk_thresh(root, type); + num_devs = get_profile_num_devs(root, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + + btrfs_calc_trans_metadata_size(root, 1); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); @@ -4130,7 +4176,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, u64 flags; flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); - btrfs_alloc_chunk(trans, root, flags); + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, root, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, + &root->fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; } } @@ -4235,6 +4295,24 @@ out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. + */ + if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + btrfs_create_pending_block_groups(trans, trans->root); + btrfs_trans_release_chunk_metadata(trans); + } return ret; } @@ -5188,6 +5266,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, trans->bytes_reserved = 0; } +/* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->root->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) @@ -6034,20 +6130,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; u64 end; int ret; - if (trans->aborted) - return 0; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else unpin = &fs_info->freed_extents[0]; - while (1) { + while (!trans->aborted) { mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); @@ -6066,6 +6161,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } + /* + * Transaction is finished. We don't need the lock anymore. We + * do need to clean up the block groups in case of a transaction + * abort. + */ + deleted_bgs = &trans->transaction->deleted_bgs; + list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { + u64 trimmed = 0; + + ret = -EROFS; + if (!trans->aborted) + ret = btrfs_discard_extent(root, + block_group->key.objectid, + block_group->key.offset, + &trimmed); + + list_del_init(&block_group->bg_list); + btrfs_put_block_group_trimming(block_group); + btrfs_put_block_group(block_group); + + if (ret) { + const char *errstr = btrfs_decode_error(ret); + btrfs_warn(fs_info, + "Discard failed while removing blockgroup: errno=%d %s\n", + ret, errstr); + } + } + return 0; } @@ -6092,11 +6215,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; struct btrfs_path *path; @@ -6110,10 +6232,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int no_quota = node->no_quota; u32 item_size; u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; int last_ref = 0; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6294,7 +6418,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { - type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -6321,7 +6444,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(root, path, iref)); + extent_data_ref_count(path, iref)); if (iref) { BUG_ON(path->slots[0] != extent_slot); } else { @@ -6356,18 +6479,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* Deal with the quota accounting */ - if (!ret && last_ref && !no_quota) { - int mod_seq = 0; - - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && - type == BTRFS_QGROUP_OPER_SUB_SHARED) - mod_seq = 1; - - ret = btrfs_qgroup_record_ref(trans, info, root_objectid, - bytenr, num_bytes, type, - mod_seq); - } out: btrfs_free_path(path); return ret; @@ -6393,7 +6504,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (rb_first(&head->ref_root)) + if (!list_empty(&head->ref_list)) goto out; if (head->extent_op) { @@ -7303,13 +7414,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - /* Always set parent to 0 here since its exclusive anyway. */ - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, ins->offset, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7391,14 +7495,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, num_bytes, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - } - ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7566,9 +7662,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, /* * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, @@ -7755,12 +7848,18 @@ reada: wc->reada_slot = slot; } +/* + * TODO: Modify related function to add related node/leaf to dirty_extent_root, + * for later qgroup accounting. + * + * Current, this function does nothing. + */ static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int nr = btrfs_header_nritems(eb); - int i, extent_type, ret; + int i, extent_type; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; @@ -7783,13 +7882,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - bytenr, num_bytes, - BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); - if (ret) - return ret; } return 0; } @@ -7858,6 +7950,8 @@ static int adjust_slots_upwards(struct btrfs_root *root, /* * root_eb is the subtree root and is locked before this function is called. + * TODO: Modify this function to mark all (including complete shared node) + * to dirty_extent_root to allow it get accounted in qgroup. */ static int account_shared_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7920,7 +8014,11 @@ walk_down: child_gen = btrfs_node_ptr_generation(eb, parent_slot); eb = read_tree_block(root, child_bytenr, child_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); ret = -EIO; goto out; } @@ -7931,16 +8029,6 @@ walk_down: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - child_bytenr, - root->nodesize, - BTRFS_QGROUP_OPER_SUB_SUBTREE, - 0); - if (ret) - goto out; - } if (level == 0) { @@ -8151,7 +8239,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, generation); - if (!next || !extent_buffer_uptodate(next)) { + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; } @@ -8533,24 +8623,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } - /* - * Qgroup update accounting is run from - * delayed ref handling. This usually works - * out because delayed refs are normally the - * only way qgroup updates are added. However, - * we may have added updates during our tree - * walk so run qgroups here to make sure we - * don't lose any updates. - */ - ret = btrfs_delayed_qgroup_accounting(trans, - root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8604,14 +8676,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: - ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); @@ -8751,14 +8815,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; u64 min_allocable_bytes; int ret = -ENOSPC; - /* * We need some metadata space and system metadata space for * allocating chunks in some corner cases until we force to set @@ -8775,6 +8838,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) spin_lock(&cache->lock); if (cache->ro) { + cache->ro++; ret = 0; goto out; } @@ -8786,7 +8850,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - cache->ro = 1; + cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } @@ -8796,7 +8860,7 @@ out: return ret; } -int btrfs_set_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8804,8 +8868,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, u64 alloc_flags; int ret; - BUG_ON(cache->ro); - again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -8848,7 +8910,7 @@ again: goto out; } - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -8856,7 +8918,7 @@ again: CHUNK_ALLOC_FORCE); if (ret < 0) goto out; - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(root, cache->flags); @@ -8919,7 +8981,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -void btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_dec_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -8929,11 +8991,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root, spin_lock(&sinfo->lock); spin_lock(&cache->lock); - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo->bytes_readonly -= num_bytes; - cache->ro = 0; - list_del_init(&cache->ro_list); + if (!--cache->ro) { + num_bytes = cache->key.offset - cache->reserved - + cache->pinned - cache->bytes_super - + btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -9449,7 +9513,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) { - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); } else if (btrfs_block_group_used(&cache->item) == 0) { spin_lock(&info->unused_bgs_lock); /* Should always be true but just in case. */ @@ -9477,11 +9541,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_RAID0], list) - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_SINGLE], list) - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); } init_global_block_rsv(info); @@ -9562,6 +9626,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); + /* + * Call to ensure the corresponding space_info object is created and + * assigned to our block group, but don't update its counters just yet. + * We want our bg to be added to the rbtree with its ->space_info set. + */ + ret = update_space_info(root->fs_info, cache->flags, 0, 0, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = btrfs_add_block_group_cache(root->fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); @@ -9569,6 +9646,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); if (ret) { @@ -9845,6 +9926,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * currently running transaction might finish and a new one start, * allowing for new block groups to be created that can reuse the same * physical device locations unless we take this special care. + * + * There may also be an implicit trim operation if the file system + * is mounted with -odiscard. The same protections must remain + * in place until the extents have been discarded completely when + * the transaction commit has completed. */ remove_em = (atomic_read(&block_group->trimming) == 0); /* @@ -9919,6 +10005,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { u64 start, end; + int trimming; block_group = list_first_entry(&fs_info->unused_bgs, struct btrfs_block_group_cache, @@ -9931,6 +10018,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->unused_bgs_lock); + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); + /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); @@ -9950,7 +10039,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); /* We don't want to force the issue, only flip if it's ok. */ - ret = set_block_group_ro(block_group, 0); + ret = inc_block_group_ro(block_group, 0); up_write(&space_info->groups_sem); if (ret < 0) { ret = 0; @@ -9964,7 +10053,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) /* 1 for btrfs_orphan_reserve_metadata() */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); ret = PTR_ERR(trans); goto next; } @@ -9991,14 +10080,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) EXTENT_DIRTY, GFP_NOFS); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); goto end_trans; } mutex_unlock(&fs_info->unused_bg_unpin_mutex); @@ -10016,15 +10105,43 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); + /* DISCARD can flip during remount */ + trimming = btrfs_test_opt(root, DISCARD); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_get_block_group_trimming(block_group); + /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. */ ret = btrfs_remove_chunk(trans, root, block_group->key.objectid); + + if (ret) { + if (trimming) + btrfs_put_block_group_trimming(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + WARN_ON(!list_empty(&block_group->bg_list)); + spin_lock(&trans->transaction->deleted_bgs_lock); + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&trans->transaction->deleted_bgs_lock); + btrfs_get_block_group(block_group); + } end_trans: btrfs_end_transaction(trans, root); next: + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } @@ -10074,10 +10191,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) return unpin_extent_range(root, start, end, false); } +/* + * It used to be that old block groups would be left around forever. + * Iterating over them would be enough to trim unused space. Since we + * now automatically remove them, we also need to iterate over unallocated + * space. + * + * We don't want a transaction for this since the discard may take a + * substantial amount of time. We don't require that a transaction be + * running, but we do need to take a running transaction into account + * to ensure that we're not discarding chunks that were released in + * the current transaction. + * + * Holding the chunks lock will prevent other threads from allocating + * or releasing chunks, but it won't prevent a running transaction + * from committing and releasing the memory that the pending chunks + * list head uses. For that, we need to take a reference to the + * transaction. + */ +static int btrfs_trim_free_extents(struct btrfs_device *device, + u64 minlen, u64 *trimmed) +{ + u64 start = 0, len = 0; + int ret; + + *trimmed = 0; + + /* Not writeable = nothing to do. */ + if (!device->writeable) + return 0; + + /* No free space = nothing to do. */ + if (device->total_bytes <= device->bytes_used) + return 0; + + ret = 0; + + while (1) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; + struct btrfs_transaction *trans; + u64 bytes; + + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); + if (ret) + return ret; + + down_read(&fs_info->commit_root_sem); + + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ret = find_free_dev_extent_start(trans, device, minlen, start, + &start, &len); + if (trans) + btrfs_put_transaction(trans); + + if (ret) { + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + if (ret == -ENOSPC) + ret = 0; + break; + } + + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + + if (ret) + break; + + start += len; + *trimmed += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_device *device; + struct list_head *devices; u64 group_trimmed; u64 start; u64 end; @@ -10132,6 +10338,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) cache = next_block_group(fs_info->tree_root, cache); } + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + devices = &root->fs_info->fs_devices->alloc_list; + list_for_each_entry(device, devices, dev_alloc_list) { + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); + if (ret) + break; + + trimmed += group_trimmed; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + range->len = trimmed; return ret; } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/fs/btrfs/extent-tree.h diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c374e1e71e5f..f1018cfbfefa 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, @@ -2481,7 +2486,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio, int err) +static void end_bio_extent_writepage(struct bio *bio) { struct bio_vec *bvec; u64 start; @@ -2511,7 +2516,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (end_extent_writepage(page, err, start, end)) + if (end_extent_writepage(page, bio->bi_error, start, end)) continue; end_page_writeback(page); @@ -2543,10 +2548,10 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct bio *bio, int err) +static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = !bio->bi_error; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2559,16 +2564,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) int ret; int i; - if (err) - uptodate = 0; - bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, - io_bio->mirror_num); + "mirror=%u\n", (u64)bio->bi_iter.bi_sector, + bio->bi_error, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; /* We always issue full-page reads, but if some block @@ -2609,8 +2611,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (tree->ops && tree->ops->readpage_io_failed_hook) { ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (!ret && !err && - test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!ret && !bio->bi_error) uptodate = 1; } else { /* @@ -2626,10 +2627,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = - test_bit(BIO_UPTODATE, &bio->bi_flags); - if (err) - uptodate = 0; + uptodate = !bio->bi_error; offset += len; continue; } @@ -2679,7 +2677,7 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, bio->bi_error); bio_put(bio); } @@ -2725,6 +2723,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; + +#ifdef CONFIG_BLK_CGROUP + /* FIXME, put this into bio_clone_bioset */ + if (bio->bi_css) + bio_associate_blkcg(new, bio->bi_css); +#endif } return new; } @@ -2785,6 +2789,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, } static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, struct block_device *bdev, @@ -2797,9 +2802,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, { int ret = 0; struct bio *bio; - int nr; int contig = 0; - int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); @@ -2821,21 +2824,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, } bio = NULL; } else { + if (wbc) + wbc_account_io(wbc, page, page_size); return 0; } } - if (this_compressed) - nr = BIO_MAX_PAGES; - else - nr = bio_get_nr_vecs(bdev); - bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, + GFP_NOFS | __GFP_HIGH); if (!bio) return -ENOMEM; bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, page, page_size); + } if (bio_ret) *bio_ret = bio; @@ -3046,7 +3052,7 @@ static int __do_readpage(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(rw, tree, page, + ret = submit_extent_page(rw, tree, NULL, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, @@ -3441,7 +3447,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(write_flags, tree, page, + ret = submit_extent_page(write_flags, tree, wbc, page, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, end_bio_extent_writepage, @@ -3691,7 +3697,7 @@ static void set_btree_ioerr(struct page *page) } } -static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +static void end_bio_extent_buffer_writepage(struct bio *bio) { struct bio_vec *bvec; struct extent_buffer *eb; @@ -3704,7 +3710,8 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { + if (bio->bi_error || + test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); } @@ -3744,7 +3751,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, tree, p, offset >> 9, + ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags); @@ -4490,6 +4497,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + flags |= FIEMAP_EXTENT_UNWRITTEN; free_extent_map(em); em = NULL; @@ -4607,9 +4616,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, { struct extent_buffer *eb = NULL; - eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS); - if (eb == NULL) - return NULL; + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; eb->len = len; eb->fs_info = fs_info; @@ -4867,7 +4874,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; for (i = 0; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS); + p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) goto free_eb; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b072e17479aa..b823fac91c92 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, } current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_log_ctx ctx; int ret = 0; bool full_sync = 0; + const u64 len = end - start + 1; trace_btrfs_sync_file(file, datasync); @@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * all extents are persisted and the respective file extent * items are in the fs/subvol btree. */ - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + ret = btrfs_wait_ordered_range(inode, start, len); } else { /* * Start any new ordered operations before starting to log the @@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - (full_sync && BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed)) { + (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed && + (full_sync || + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9dbe5b548fa6..abe3a66bd3ba 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, { int ret = 0; struct btrfs_path *path = btrfs_alloc_path(); + bool locked = false; if (!path) { ret = -ENOMEM; @@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, } if (block_group) { + locked = true; mutex_lock(&trans->transaction->cache_write_mutex); if (!list_empty(&block_group->io_list)) { list_del_init(&block_group->io_list); @@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - mutex_unlock(&trans->transaction->cache_write_mutex); - btrfs_abort_transaction(trans, root, ret); - return ret; - } + if (ret) + goto fail; ret = btrfs_update_inode(trans, root, inode); - if (block_group) - mutex_unlock(&trans->transaction->cache_write_mutex); - fail: + if (locked) + mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, root, ret); @@ -3274,35 +3272,23 @@ next: return ret; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache) { - int ret; + atomic_inc(&cache->trimming); +} - *trimmed = 0; +void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + bool cleanup; spin_lock(&block_group->lock); - if (block_group->removed) { - spin_unlock(&block_group->lock); - return 0; - } - atomic_inc(&block_group->trimming); + cleanup = (atomic_dec_and_test(&block_group->trimming) && + block_group->removed); spin_unlock(&block_group->lock); - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); - if (ret) - goto out; - - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); -out: - spin_lock(&block_group->lock); - if (atomic_dec_and_test(&block_group->trimming) && - block_group->removed) { - struct extent_map_tree *em_tree; - struct extent_map *em; - - spin_unlock(&block_group->lock); - + if (cleanup) { lock_chunks(block_group->fs_info->chunk_root); em_tree = &block_group->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); @@ -3326,10 +3312,31 @@ out: * this block group have left 1 entry each one. Free them. */ __btrfs_remove_free_space_cache(block_group->free_space_ctl); - } else { + } +} + +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { spin_unlock(&block_group->lock); + return 0; } + btrfs_get_block_group_trimming(block_group); + spin_unlock(&block_group->lock); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + goto out; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + btrfs_put_block_group_trimming(block_group); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index f6a596d5a637..d4a582ac3f73 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -246,6 +246,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock; struct btrfs_free_space *info; struct rb_node *n; u64 count; @@ -254,24 +255,30 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) return; while (1) { + bool add_to_ctl = true; + + spin_lock(rbroot_lock); n = rb_first(rbroot); - if (!n) + if (!n) { + spin_unlock(rbroot_lock); break; + } info = rb_entry(n, struct btrfs_free_space, offset_index); BUG_ON(info->bitmap); /* Logic error */ if (info->offset > root->ino_cache_progress) - goto free; + add_to_ctl = false; else if (info->offset + info->bytes > root->ino_cache_progress) count = root->ino_cache_progress - info->offset + 1; else count = info->bytes; - __btrfs_add_free_space(ctl, info->offset, count); -free: rb_erase(&info->offset_index, rbroot); - kfree(info); + spin_unlock(rbroot_lock); + if (add_to_ctl) + __btrfs_add_free_space(ctl, info->offset, count); + kmem_cache_free(btrfs_free_space_cachep, info); } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bb013672aee..237da012f7d0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1845,8 +1845,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int ret; ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); - if (ret) - bio_endio(bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -1906,8 +1908,10 @@ mapit: ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); out: - if (ret < 0) - bio_endio(bio, ret); + if (ret < 0) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -3654,6 +3658,35 @@ cache_index: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We don't persist the id of the transaction where an unlink operation + * against the inode was last made. So here we assume the inode might + * have been evicted, and therefore the exact value of last_unlink_trans + * lost, and set it to last_trans to avoid metadata inconsistencies + * between the inode and its parent if the inode is fsync'ed and the log + * replayed. For example, in the scenario: + * + * touch mydir/foo + * ln mydir/foo mydir/bar + * sync + * unlink mydir/bar + * echo 2 > /proc/sys/vm/drop_caches # evicts inode + * xfs_io -c fsync mydir/foo + * <power failure> + * mount fs, triggers fsync log replay + * + * We must make sure that when we fsync our inode foo we also log its + * parent inode, otherwise after log replay the parent still has the + * dentry with the "bar" name but our inode foo has a link count of 1 + * and doesn't have an inode ref with the name "bar" anymore. + * + * Setting last_unlink_trans to last_trans is a pessimistic approach, + * but it guarantees correctness at the expense of ocassional full + * transaction commits on fsync if our inode is a directory, or if our + * inode is not a directory, logging its parent unnecessarily. + */ + BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -4209,7 +4242,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 last_size = (u64)-1; + u64 last_size = new_size; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4493,8 +4526,7 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); @@ -4986,24 +5018,41 @@ static void evict_inode_truncate_pages(struct inode *inode) } write_unlock(&map_tree->lock); + /* + * Keep looping until we have no more ranges in the io tree. + * We can have ongoing bios started by readpages (called from readahead) + * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * still in progress (unlocked the pages in the bio but did not yet + * unlocked the ranges in the io tree). Therefore this means some + * ranges can still be locked and eviction started because before + * submitting those bios, which are executed by a separate task (work + * queue kthread), inode references (inode->i_count) were not taken + * (which would be dropped in the end io callback of each bio). + * Therefore here we effectively end up waiting for those bios and + * anyone else holding locked ranges without having bumped the inode's + * reference count - if we don't do it, when they access the inode's + * io_tree to unlock a range it may be too late, leading to an + * use-after-free issue. + */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; + u64 start; + u64 end; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); - atomic_inc(&state->refs); + start = state->start; + end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, state->start, state->end, - 0, &cached_state); - clear_extent_bit(io_tree, state->start, state->end, + lock_extent_bits(io_tree, start, end, 0, &cached_state); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); - free_extent_state(state); cond_resched(); spin_lock(&io_tree->lock); @@ -7530,6 +7579,7 @@ unlock: current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); + set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); } /* @@ -7671,13 +7721,13 @@ struct btrfs_retry_complete { int uptodate; }; -static void btrfs_retry_endio_nocsum(struct bio *bio, int err) +static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct bio_vec *bvec; int i; - if (err) + if (bio->bi_error) goto end; done->uptodate = 1; @@ -7726,7 +7776,7 @@ try_again: return 0; } -static void btrfs_retry_endio(struct bio *bio, int err) +static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); @@ -7735,7 +7785,7 @@ static void btrfs_retry_endio(struct bio *bio, int err) int ret; int i; - if (err) + if (bio->bi_error) goto end; uptodate = 1; @@ -7818,12 +7868,13 @@ static int btrfs_subio_endio_read(struct inode *inode, } } -static void btrfs_endio_direct_read(struct bio *bio, int err) +static void btrfs_endio_direct_read(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + int err = bio->bi_error; if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); @@ -7834,17 +7885,14 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) kfree(dip); - /* If we had a csum failure make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) io_bio->end_io(io_bio, err); bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio, int err) +static void btrfs_endio_direct_write(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; @@ -7855,12 +7903,11 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct bio *dio_bio; int ret; - if (err) - goto out_done; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, - ordered_bytes, !err); + ordered_bytes, + !bio->bi_error); if (!ret) goto out_test; @@ -7879,15 +7926,11 @@ out_test: ordered = NULL; goto again; } -out_done: dio_bio = dip->dio_bio; kfree(dip); - /* If we had an error make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } @@ -7902,9 +7945,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } -static void btrfs_end_dio_bio(struct bio *bio, int err) +static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + int err = bio->bi_error; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -7933,8 +7977,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); - bio_endio(dip->orig_bio, 0); + dip->dio_bio->bi_error = 0; + bio_endio(dip->orig_bio); } out: bio_put(bio); @@ -7943,8 +7987,11 @@ out: static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, u64 first_sector, gfp_t gfp_flags) { - int nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); + struct bio *bio; + bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); + if (bio) + bio_associate_current(bio); + return bio; } static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, @@ -8147,9 +8194,8 @@ out_err: static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_dio_private *dip; - struct bio *io_bio; + struct btrfs_dio_private *dip = NULL; + struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; int write = rw & REQ_WRITE; @@ -8166,7 +8212,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_io_bio; + goto free_ordered; } dip->private = dio_bio->bi_private; @@ -8194,25 +8240,56 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (btrfs_bio->end_io) btrfs_bio->end_io(btrfs_bio, ret); -free_io_bio: - bio_put(io_bio); free_ordered: /* - * If this is a write, we need to clean up the reserved space and kill - * the ordered extent. + * If we arrived here it means either we failed to submit the dip + * or we either failed to clone the dio_bio or failed to allocate the + * dip. If we cloned the dio_bio and allocated the dip, we can just + * call bio_endio against our io_bio so that we get proper resource + * cleanup if we fail to submit the dip, otherwise, we must do the + * same as btrfs_endio_direct_[write|read] because we can't call these + * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (write) { - struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len, 1); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); + if (io_bio && dip) { + io_bio->bi_error = -EIO; + bio_endio(io_bio); + /* + * The end io callbacks free our dip, do the final put on io_bio + * and all the cleanup and final put for dio_bio (through + * dio_end_io()). + */ + dip = NULL; + io_bio = NULL; + } else { + if (write) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_extent(inode, + file_offset); + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + /* + * Decrements our ref on the ordered extent and removes + * the ordered extent from the inode's ordered tree, + * doing all the proper resource cleanup such as for the + * reserved space and waking up any waiters for this + * ordered extent (through btrfs_remove_ordered_extent). + */ + btrfs_finish_ordered_io(ordered); + } else { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + dio_bio->bi_error = -EIO; + /* + * Releases and cleans up our dio_bio, no need to bio_put() + * nor bio_endio()/bio_io_error() against dio_bio. + */ + dio_end_io(dio_bio, ret); } - bio_endio(dio_bio, ret); + if (io_bio) + bio_put(io_bio); + kfree(dip); } static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, @@ -8314,9 +8391,18 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) - btrfs_delalloc_release_space(inode, count); - else if (ret >= 0 && (size_t)ret < count) + if (ret < 0 && ret != -EIOCBQUEUED) { + /* + * If the error comes from submitting stage, + * btrfs_get_blocsk_direct() has free'd data space, + * and metadata space will be handled by + * finish_ordered_fn, don't do that again to make + * sure bytes_may_use is correct. + */ + if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, + &BTRFS_I(inode)->runtime_flags)) + btrfs_delalloc_release_space(inode, count); + } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1c22c6518504..0adf5422fce9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 { static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff); + u64 off, u64 olen, u64 olen_aligned, u64 destoff, + int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -553,8 +554,8 @@ static noinline int create_subvol(struct inode *dir, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); ret = PTR_ERR(new_root); + btrfs_abort_transaction(trans, root, ret); goto fail; } @@ -1029,6 +1030,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, struct extent_map *em; int ret = 1; bool next_mergeable = true; + bool prev_mergeable = true; /* * make sure that once we start defragging an extent, we keep on @@ -1049,13 +1051,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, goto out; } + if (!*defrag_end) + prev_mergeable = false; + next_mergeable = defrag_check_next_extent(inode, em); /* * we hit a real extent, if it is big or the next extent is not a * real extent, don't bother defragging it */ if (!compress && (*last_len == 0 || *last_len >= thresh) && - (em->len >= thresh || !next_mergeable)) + (em->len >= thresh || (!next_mergeable && !prev_mergeable))) ret = 0; out: /* @@ -1318,7 +1323,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index + 1; + max_to_defrag = last_index - i + 1; /* * make writeback starts from i, so the defrag range can be @@ -1368,7 +1373,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, cluster); - ra_index += max_cluster; + ra_index += cluster; } mutex_lock(&inode->i_mutex); @@ -1932,6 +1937,7 @@ static noinline int copy_to_sk(struct btrfs_root *root, u64 found_transid; struct extent_buffer *leaf; struct btrfs_ioctl_search_header sh; + struct btrfs_key test; unsigned long item_off; unsigned long item_len; int nritems; @@ -2015,12 +2021,17 @@ static noinline int copy_to_sk(struct btrfs_root *root, } advance_key: ret = 0; - if (key->offset < (u64)-1 && key->offset < sk->max_offset) + test.objectid = sk->max_objectid; + test.type = sk->max_type; + test.offset = sk->max_offset; + if (btrfs_comp_cpu_keys(key, &test) >= 0) + ret = 1; + else if (key->offset < (u64)-1) key->offset++; - else if (key->type < (u8)-1 && key->type < sk->max_type) { + else if (key->type < (u8)-1) { key->offset = 0; key->type++; - } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) { + } else if (key->objectid < (u64)-1) { key->offset = 0; key->type = 0; key->objectid++; @@ -2271,10 +2282,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, { struct btrfs_ioctl_ino_lookup_args *args; struct inode *inode; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) @@ -2282,13 +2290,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, inode = file_inode(file); + /* + * Unprivileged query to obtain the containing subvolume root id. The + * path is reset so it's consistent with btrfs_search_path_in_tree. + */ if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { + args->name[0] = 0; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, args->treeid, args->objectid, args->name); +out: if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2413,8 +2436,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_unlock_inode; } - d_invalidate(dentry); - down_write(&root->fs_info->subvol_sem); err = may_destroy_subvol(dest); @@ -2508,7 +2529,7 @@ out_up_write: out_unlock_inode: mutex_unlock(&inode->i_mutex); if (!err) { - shrink_dcache_sb(root->fs_info->sb); + d_invalidate(dentry); btrfs_invalidate_inodes(dest); d_delete(dentry); ASSERT(dest->send_in_progress == 0); @@ -2755,14 +2776,11 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, u64 off) +static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - pgoff_t index; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - index = off >> PAGE_CACHE_SHIFT; - page = grab_cache_page(inode->i_mapping, index); if (!page) return NULL; @@ -2783,6 +2801,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off) return page; } +static int gather_extent_pages(struct inode *inode, struct page **pages, + int num_pages, u64 off) +{ + int i; + pgoff_t index = off >> PAGE_CACHE_SHIFT; + + for (i = 0; i < num_pages; i++) { + pages[i] = extent_same_get_page(inode, index + i); + if (!pages[i]) + return -ENOMEM; + } + return 0; +} + static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) { /* do any pending delalloc/csum calc on src, one way or @@ -2808,52 +2840,118 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) } } -static void btrfs_double_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); - mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); } -static void btrfs_double_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) { if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); lock_extent_range(inode1, loff1, len); - if (inode1 != inode2) { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); - lock_extent_range(inode2, loff2, len); + lock_extent_range(inode2, loff2, len); +} + +struct cmp_pages { + int num_pages; + struct page **src_pages; + struct page **dst_pages; +}; + +static void btrfs_cmp_data_free(struct cmp_pages *cmp) +{ + int i; + struct page *pg; + + for (i = 0; i < cmp->num_pages; i++) { + pg = cmp->src_pages[i]; + if (pg) + page_cache_release(pg); + pg = cmp->dst_pages[i]; + if (pg) + page_cache_release(pg); + } + kfree(cmp->src_pages); + kfree(cmp->dst_pages); +} + +static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, + struct inode *dst, u64 dst_loff, + u64 len, struct cmp_pages *cmp) +{ + int ret; + int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + struct page **src_pgarr, **dst_pgarr; + + /* + * We must gather up all the pages before we initiate our + * extent locking. We use an array for the page pointers. Size + * of the array is bounded by len, which is in turn bounded by + * BTRFS_MAX_DEDUPE_LEN. + */ + src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + if (!src_pgarr || !dst_pgarr) { + kfree(src_pgarr); + kfree(dst_pgarr); + return -ENOMEM; } + cmp->num_pages = num_pages; + cmp->src_pages = src_pgarr; + cmp->dst_pages = dst_pgarr; + + ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff); + if (ret) + goto out; + + ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff); + +out: + if (ret) + btrfs_cmp_data_free(cmp); + return 0; } static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, - u64 dst_loff, u64 len) + u64 dst_loff, u64 len, struct cmp_pages *cmp) { int ret = 0; + int i; struct page *src_page, *dst_page; unsigned int cmp_len = PAGE_CACHE_SIZE; void *addr, *dst_addr; + i = 0; while (len) { if (len < PAGE_CACHE_SIZE) cmp_len = len; - src_page = extent_same_get_page(src, loff); - if (!src_page) - return -EINVAL; - dst_page = extent_same_get_page(dst, dst_loff); - if (!dst_page) { - page_cache_release(src_page); - return -EINVAL; - } + BUG_ON(i >= cmp->num_pages); + + src_page = cmp->src_pages[i]; + dst_page = cmp->dst_pages[i]; + addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -2865,26 +2963,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, kunmap_atomic(addr); kunmap_atomic(dst_addr); - page_cache_release(src_page); - page_cache_release(dst_page); if (ret) break; - loff += cmp_len; - dst_loff += cmp_len; len -= cmp_len; + i++; } return ret; } -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, + u64 olen) { + u64 len = *plen; u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - if (off + len > inode->i_size || off + len < off) + if (off + olen > inode->i_size || off + olen < off) return -EINVAL; + + /* if we extend to eof, continue to block boundary */ + if (off + len == inode->i_size) + *plen = len = ALIGN(inode->i_size, bs) - off; + /* Check that we are block aligned - btrfs_clone() requires this */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) return -EINVAL; @@ -2892,31 +2994,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) return 0; } -static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; + u64 len = olen; + struct cmp_pages cmp; + int same_inode = 0; + u64 same_lock_start = 0; + u64 same_lock_len = 0; - /* - * btrfs_clone() can't handle extents in the same file - * yet. Once that works, we can drop this check and replace it - * with a check for the same inode, but overlapping extents. - */ if (src == dst) - return -EINVAL; + same_inode = 1; if (len == 0) return 0; - btrfs_double_lock(src, loff, dst, dst_loff, len); + if (same_inode) { + mutex_lock(&src->i_mutex); + + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; - ret = extent_same_check_offsets(src, loff, len); - if (ret) - goto out_unlock; + /* + * Single inode case wants the same checks, except we + * don't want our length pushed out past i_size as + * comparing that data range makes no sense. + * + * extent_same_check_offsets() will do this for an + * unaligned length at i_size, so catch it here and + * reject the request. + * + * This effectively means we require aligned extents + * for the single-inode case, whereas the other cases + * allow an unaligned length so long as it ends at + * i_size. + */ + if (len != olen) { + ret = -EINVAL; + goto out_unlock; + } - ret = extent_same_check_offsets(dst, dst_loff, len); - if (ret) - goto out_unlock; + /* Check for overlapping ranges */ + if (dst_loff + len > loff && dst_loff < loff + len) { + ret = -EINVAL; + goto out_unlock; + } + + same_lock_start = min_t(u64, loff, dst_loff); + same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; + } else { + btrfs_double_inode_lock(src, dst); + + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); + if (ret) + goto out_unlock; + } /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != @@ -2925,12 +3063,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, goto out_unlock; } - ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); + if (ret) + goto out_unlock; + + if (same_inode) + lock_extent_range(src, same_lock_start, same_lock_len); + else + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + + /* pass original length for comparison so we stay within i_size */ + ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); if (ret == 0) - ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + + if (same_inode) + unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, + same_lock_start + same_lock_len - 1); + else + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_cmp_data_free(&cmp); out_unlock: - btrfs_double_unlock(src, loff, dst, dst_loff, len); + if (same_inode) + mutex_unlock(&src->i_mutex); + else + btrfs_double_inode_unlock(src, dst); return ret; } @@ -2940,7 +3098,7 @@ out_unlock: static long btrfs_ioctl_file_extent_same(struct file *file, struct btrfs_ioctl_same_args __user *argp) { - struct btrfs_ioctl_same_args *same; + struct btrfs_ioctl_same_args *same = NULL; struct btrfs_ioctl_same_extent_info *info; struct inode *src = file_inode(file); u64 off; @@ -2970,6 +3128,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, if (IS_ERR(same)) { ret = PTR_ERR(same); + same = NULL; goto out; } @@ -3040,6 +3199,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, out: mnt_drop_write_file(file); + kfree(same); return ret; } @@ -3082,13 +3242,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, const u64 destoff, - const u64 olen) + const u64 olen, + int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (!no_time_update) + inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -3173,13 +3335,13 @@ static void clone_update_extent_map(struct inode *inode, * @inode: Inode to clone to * @off: Offset within source to start clone from * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen, extent_same uses - * identical values here + * @olen_aligned: Block-aligned value of olen * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff) + const u64 destoff, int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -3434,6 +3596,20 @@ process_slot: u64 trim = 0; u64 aligned_end = 0; + /* + * Don't copy an inline extent into an offset + * greater than zero. Having an inline extent + * at such an offset results in chaos as btrfs + * isn't prepared for such cases. Just skip + * this case for the same reasons as commented + * at btrfs_ioctl_clone(). + */ + if (last_dest_end > 0) { + ret = -EOPNOTSUPP; + btrfs_end_transaction(trans, root); + goto out; + } + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3503,7 +3679,8 @@ process_slot: root->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, - destoff, olen); + destoff, olen, + no_time_update); if (ret) goto out; if (new_key.offset + datal >= destoff + len) @@ -3541,7 +3718,7 @@ process_slot: clone_update_extent_map(inode, trans, NULL, last_dest_end, destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen); + destoff, olen, no_time_update); } out: @@ -3618,13 +3795,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_fput; if (!same_inode) { - if (inode < src) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - } + btrfs_double_inode_lock(src, inode); } else { mutex_lock(&src->i_mutex); } @@ -3674,11 +3845,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, lock_extent_range(src, lock_start, lock_len); } else { - lock_extent_range(src, off, len); - lock_extent_range(inode, destoff, len); + btrfs_double_extent_lock(src, off, inode, destoff, len); } - ret = btrfs_clone(src, inode, off, olen, len, destoff); + ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); if (same_inode) { u64 lock_start = min_t(u64, off, destoff); @@ -3686,9 +3856,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); } else { - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, - destoff + len - 1); + btrfs_double_extent_unlock(src, off, inode, destoff, len); } /* * Truncate page cache pages so that future reads will see the cloned @@ -3697,17 +3865,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, truncate_inode_pages_range(&inode->i_data, destoff, PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: - if (!same_inode) { - if (inode < src) { - mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); - } else { - mutex_unlock(&inode->i_mutex); - mutex_unlock(&src->i_mutex); - } - } else { + if (!same_inode) + btrfs_double_inode_unlock(src, inode); + else mutex_unlock(&src->i_mutex); - } out_fput: fdput(src_file); out_drop_write: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f8229ef1b46d..d7e6baf1b205 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -241,6 +241,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) */ void btrfs_tree_lock(struct extent_buffer *eb) { + WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 760c4a5e096b..52170cf1757e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && - !(type == BTRFS_ORDERED_NOCOW)) - entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - WARN_ON(entry->csum_bytes_left < sum->len); - entry->csum_bytes_left -= sum->len; - if (entry->csum_bytes_left == 0) - wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - list_add_tail(&ordered->trans_list, &trans->ordered); + /* + * If our ordered extent completed it means it updated the + * fs/subvol and csum trees already, so no need to make the + * current transaction's commit wait for it, as we end up + * holding memory unnecessarily and delaying the inode's iput + * until the transaction commit (we schedule an iput for the + * inode when the ordered extent's refcount drops to 0), which + * prevents it from being evictable until the transaction + * commits. + */ + if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) + btrfs_put_ordered_extent(ordered); + else + list_add_tail(&ordered->trans_list, &trans->ordered); + spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -545,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + ASSERT(list_empty(&entry->log_list)); + ASSERT(list_empty(&entry->trans_list)); + ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { @@ -572,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); @@ -844,6 +856,20 @@ out: return entry; } +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_extent *oe; + + oe = btrfs_lookup_ordered_range(inode, file_offset, len); + if (oe) { + btrfs_put_ordered_extent(oe); + return true; + } + return false; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e96cd4ccd805..7176cc0fe43f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -89,9 +89,6 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; - /* number of bytes that still need csumming */ - u64 csum_bytes_left; - /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. Please see the comment of @@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, u64 file_offset, u64 len); +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3d6546581bb9..d904ee1c5349 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -34,6 +34,7 @@ #include "extent_io.h" #include "qgroup.h" + /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? * - reorganize keys @@ -84,11 +85,42 @@ struct btrfs_qgroup { /* * temp variables for accounting operations + * Refer to qgroup_shared_accouting() for details. */ u64 old_refcnt; u64 new_refcnt; }; +static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->old_refcnt < seq) + qg->old_refcnt = seq; + qg->old_refcnt += mod; +} + +static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->new_refcnt < seq) + qg->new_refcnt = seq; + qg->new_refcnt += mod; +} + +static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->old_refcnt < seq) + return 0; + return qg->old_refcnt - seq; +} + +static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->new_refcnt < seq) + return 0; + return qg->new_refcnt - seq; +} + /* * glue structure to represent the relations between qgroups. */ @@ -344,7 +376,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - btrfs_err(fs_info, "inconsitent qgroup config"); + btrfs_err(fs_info, "inconsistent qgroup config"); flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } if (!qgroup) { @@ -1115,14 +1147,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct ulist *tmp; int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; if (!quota_root) { @@ -1317,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; + /* Sometimes we would want to clear the limit on this qgroup. + * To meet this requirement, we treat the -1 as a special value + * which tell kernel to clear the limit on this qgroup. + */ + const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; @@ -1332,14 +1369,42 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, } spin_lock(&fs_info->qgroup_lock); - if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) - qgroup->max_rfer = limit->max_rfer; - if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) - qgroup->max_excl = limit->max_excl; - if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) - qgroup->rsv_rfer = limit->rsv_rfer; - if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) - qgroup->rsv_excl = limit->rsv_excl; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { + if (limit->max_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + qgroup->max_rfer = 0; + } else { + qgroup->max_rfer = limit->max_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + if (limit->max_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + qgroup->max_excl = 0; + } else { + qgroup->max_excl = limit->max_excl; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { + if (limit->rsv_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + qgroup->rsv_rfer = 0; + } else { + qgroup->rsv_rfer = limit->rsv_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { + if (limit->rsv_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + qgroup->rsv_excl = 0; + } else { + qgroup->rsv_excl = limit->rsv_excl; + } + } qgroup->lim_flags |= limit->flags; spin_unlock(&fs_info->qgroup_lock); @@ -1356,239 +1421,86 @@ out: return ret; } -static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { - /* - * Ignore seq and type here, we're looking for any operation - * at all related to this extent on that root. - */ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - return 0; -} + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + u64 qgroup_to_skip; + int ret = 0; -static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct rb_node *n; - struct btrfs_qgroup_operation *cur; - int cmp; + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; - spin_lock(&fs_info->qgroup_op_lock); - n = fs_info->qgroup_op_tree.rb_node; - while (n) { - cur = rb_entry(n, struct btrfs_qgroup_operation, n); - cmp = comp_oper_exist(cur, oper); - if (cmp < 0) { - n = n->rb_right; - } else if (cmp) { - n = n->rb_left; - } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; - } + /* + * No need to do lock, since this function will only be called in + * btrfs_commmit_transaction(). + */ + node = rb_first(&delayed_refs->dirty_extent_root); + while (node) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); + ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, + &record->old_roots); + if (ret < 0) + break; + if (qgroup_to_skip) + ulist_del(record->old_roots, qgroup_to_skip, 0); + node = rb_next(node); } - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -static int comp_oper(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) -{ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - if (oper1->seq < oper2->seq) - return -1; - if (oper1->seq > oper2->seq) - return 1; - if (oper1->type < oper2->type) - return -1; - if (oper1->type > oper2->type) - return 1; - return 0; + return ret; } -static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_qgroup_operation *cur; - int cmp; + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_qgroup_extent_record *entry; + u64 bytenr = record->bytenr; - spin_lock(&fs_info->qgroup_op_lock); - p = &fs_info->qgroup_op_tree.rb_node; while (*p) { - parent = *p; - cur = rb_entry(parent, struct btrfs_qgroup_operation, n); - cmp = comp_oper(cur, oper); - if (cmp < 0) { - p = &(*p)->rb_right; - } else if (cmp) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, + node); + if (bytenr < entry->bytenr) p = &(*p)->rb_left; - } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; - } - } - rb_link_node(&oper->n, parent, p); - rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -/* - * Record a quota operation for processing later on. - * @trans: the transaction we are adding the delayed op to. - * @fs_info: the fs_info for this fs. - * @ref_root: the root of the reference we are acting on, - * @bytenr: the bytenr we are acting on. - * @num_bytes: the number of bytes in the reference. - * @type: the type of operation this is. - * @mod_seq: do we need to get a sequence number for looking up roots. - * - * We just add it to our trans qgroup_ref_list and carry on and process these - * operations in order at some later point. If the reference root isn't a fs - * root then we don't bother with doing anything. - * - * MUST BE HOLDING THE REF LOCK. - */ -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, - u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, int mod_seq) -{ - struct btrfs_qgroup_operation *oper; - int ret; - - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - return 0; - - oper = kmalloc(sizeof(*oper), GFP_NOFS); - if (!oper) - return -ENOMEM; - - oper->ref_root = ref_root; - oper->bytenr = bytenr; - oper->num_bytes = num_bytes; - oper->type = type; - oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); - INIT_LIST_HEAD(&oper->elem.list); - oper->elem.seq = 0; - - trace_btrfs_qgroup_record_ref(oper); - - if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { - /* - * If any operation for this bytenr/ref_root combo - * exists, then we know it's not exclusively owned and - * shouldn't be queued up. - * - * This also catches the case where we have a cloned - * extent that gets queued up multiple times during - * drop snapshot. - */ - if (qgroup_oper_exists(fs_info, oper)) { - kfree(oper); - return 0; - } - } - - ret = insert_qgroup_oper(fs_info, oper); - if (ret) { - /* Shouldn't happen so have an assert for developers */ - ASSERT(0); - kfree(oper); - return ret; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; } - list_add_tail(&oper->list, &trans->qgroup_ref_list); - - if (mod_seq) - btrfs_get_tree_mod_seq(fs_info, &oper->elem); - - return 0; -} - -static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *tmp; - int sign = 0; - int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - - spin_lock(&fs_info->qgroup_lock); - if (!fs_info->quota_root) - goto out; - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - sign = 1; - break; - case BTRFS_QGROUP_OPER_SUB_EXCL: - sign = -1; - break; - default: - ASSERT(0); - } - ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, - oper->num_bytes, sign); -out: - spin_unlock(&fs_info->qgroup_lock); - ulist_free(tmp); - return ret; + rb_link_node(&record->node, parent_node, p); + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); + return NULL; } +#define UPDATE_NEW 0 +#define UPDATE_OLD 1 /* - * Walk all of the roots that pointed to our bytenr and adjust their refcnts as - * properly. + * Walk all of the roots that points to the bytenr and adjust their refcnts. */ -static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, - u64 root_to_skip, struct ulist *tmp, - struct ulist *roots, struct ulist *qgroups, - u64 seq, int *old_roots, int rescan) +static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + struct ulist *qgroups, u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; struct ulist_node *tmp_unode; struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret; + int ret = 0; + if (!roots) + return 0; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { - /* We don't count our current root here */ - if (unode->val == root_to_skip) - continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - /* - * We could have a pending removal of this same ref so we may - * not have actually found our ref root when doing - * btrfs_find_all_roots, so we need to keep track of how many - * old roots we find in case we removed ours and added a - * different one at the same time. I don't think this could - * happen in practice but that sort of thinking leads to pain - * and suffering and to the dark side. - */ - (*old_roots)++; ulist_reinit(tmp); ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), @@ -1603,29 +1515,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *glist; qg = u64_to_ptr(tmp_unode->aux); - /* - * We use this sequence number to keep from having to - * run the whole list and 0 out the refcnt every time. - * We basically use sequnce as the known 0 count and - * then add 1 everytime we see a qgroup. This is how we - * get how many of the roots actually point up to the - * upper level qgroups in order to determine exclusive - * counts. - * - * For rescan we want to set old_refcnt to seq so our - * exclusive calculations end up correct. - */ - if (rescan) - qg->old_refcnt = seq; - else if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; + if (update_old) + btrfs_qgroup_update_old_refcnt(qg, seq, 1); else - qg->old_refcnt++; - - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; + btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(qgroups, glist->group->qgroupid, ptr_to_u64(glist->group), @@ -1644,161 +1537,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, } /* - * We need to walk forward in our operation tree and account for any roots that - * were deleted after we made this operation. - */ -static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct ulist *tmp, - struct ulist *qgroups, u64 seq, - int *old_roots) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - struct btrfs_qgroup_operation *tmp_oper; - struct rb_node *n; - int ret; - - ulist_reinit(tmp); - - /* - * We only walk forward in the tree since we're only interested in - * removals that happened _after_ our operation. - */ - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - return 0; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - while (tmp_oper->bytenr == oper->bytenr) { - /* - * If it's not a removal we don't care, additions work out - * properly with our refcnt tracking. - */ - if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && - tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) - goto next; - qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); - if (!qg) - goto next; - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret) { - if (ret < 0) - return ret; - /* - * We only want to increase old_roots if this qgroup is - * not already in the list of qgroups. If it is already - * there then that means it must have been re-added or - * the delete will be discarded because we had an - * existing ref that we haven't looked up yet. In this - * case we don't want to increase old_roots. So if ret - * == 1 then we know that this is the first time we've - * seen this qgroup and we can bump the old_roots. - */ - (*old_roots)++; - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - } -next: - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&tmp_oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - break; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - } - - /* Ok now process the qgroups we found */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* Add refcnt for the newly added reference. */ -static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct btrfs_qgroup *qgroup, - struct ulist *tmp, struct ulist *qgroups, - u64 seq) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - int ret; - - ulist_reinit(tmp); - ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - } else { - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * This adjusts the counters for all referenced qgroups if need be. + * Update qgroup rfer/excl counters. + * Rfer update is easy, codes can explain themselves. + * + * Excl update is tricky, the update is split into 2 part. + * Part 1: Possible exclusive <-> sharing detect: + * | A | !A | + * ------------------------------------- + * B | * | - | + * ------------------------------------- + * !B | + | ** | + * ------------------------------------- + * + * Conditions: + * A: cur_old_roots < nr_old_roots (not exclusive before) + * !A: cur_old_roots == nr_old_roots (possible exclusive before) + * B: cur_new_roots < nr_new_roots (not exclusive now) + * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * + * Results: + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing + * *: Definitely not changed. **: Possible unchanged. + * + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. + * + * To make the logic clear, we first use condition A and B to split + * combination into 4 results. + * + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them + * only on variant maybe 0. + * + * Lastly, check result **, since there are 2 variants maybe 0, split them + * again(2x2). + * But this time we don't need to consider other things, the codes and logic + * is easy to understand now. */ -static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, - u64 root_to_skip, u64 num_bytes, - struct ulist *qgroups, u64 seq, - int old_roots, int new_roots, int rescan) +static int qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct ulist *qgroups, + u64 nr_old_roots, + u64 nr_new_roots, + u64 num_bytes, u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1810,423 +1588,196 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, bool dirty = false; qg = u64_to_ptr(unode->aux); - /* - * Wasn't referenced before but is now, add to the reference - * counters. - */ - if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + + /* Rfer update part */ + if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; qg->rfer_cmpr += num_bytes; dirty = true; } - - /* - * Was referenced before but isn't now, subtract from the - * reference counters. - */ - if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + if (cur_old_count > 0 && cur_new_count == 0) { qg->rfer -= num_bytes; qg->rfer_cmpr -= num_bytes; dirty = true; } - if (qg->old_refcnt < seq) - cur_old_count = 0; - else - cur_old_count = qg->old_refcnt - seq; - if (qg->new_refcnt < seq) - cur_new_count = 0; - else - cur_new_count = qg->new_refcnt - seq; + /* Excl update part */ + /* Exclusive/none -> shared case */ + if (cur_old_count == nr_old_roots && + cur_new_count < nr_new_roots) { + /* Exclusive -> shared */ + if (cur_old_count != 0) { + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + } - /* - * If our refcount was the same as the roots previously but our - * new count isn't the same as the number of roots now then we - * went from having a exclusive reference on this range to not. - */ - if (old_roots && cur_old_count == old_roots && - (cur_new_count != new_roots || new_roots == 0)) { - WARN_ON(cur_new_count != new_roots && new_roots == 0); - qg->excl -= num_bytes; - qg->excl_cmpr -= num_bytes; - dirty = true; + /* Shared -> exclusive/none case */ + if (cur_old_count < nr_old_roots && + cur_new_count == nr_new_roots) { + /* Shared->exclusive */ + if (cur_new_count != 0) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } } - /* - * If we didn't reference all the roots before but now we do we - * have an exclusive reference to this range. - */ - if ((!old_roots || (old_roots && cur_old_count != old_roots)) - && cur_new_count == new_roots) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - dirty = true; + /* Exclusive/none -> exclusive/none case */ + if (cur_old_count == nr_old_roots && + cur_new_count == nr_new_roots) { + if (cur_old_count == 0) { + /* None -> exclusive/none */ + + if (cur_new_count != 0) { + /* None -> exclusive */ + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + /* None -> none, nothing changed */ + } else { + /* Exclusive -> exclusive/none */ + + if (cur_new_count == 0) { + /* Exclusive -> none */ + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + /* Exclusive -> exclusive, nothing changed */ + } } + /* For exclusive extent, free its reserved bytes too */ + if (nr_old_roots == 0 && nr_new_roots == 1 && + cur_new_count == nr_new_roots) + qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } return 0; } -/* - * If we removed a data extent and there were other references for that bytenr - * then we need to lookup all referenced roots to make sure we still don't - * reference this bytenr. If we do then we can just discard this operation. - */ -static int check_existing_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; - - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - oper->elem.seq, &roots); - if (ret < 0) - return ret; - ret = 0; - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - if (unode->val == oper->ref_root) { - ret = 1; - break; - } - } - ulist_free(roots); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - - return ret; -} - -/* - * If we share a reference across multiple roots then we may need to adjust - * various qgroups referenced and exclusive counters. The basic premise is this - * - * 1) We have seq to represent a 0 count. Instead of looping through all of the - * qgroups and resetting their refcount to 0 we just constantly bump this - * sequence number to act as the base reference count. This means that if - * anybody is equal to or below this sequence they were never referenced. We - * jack this sequence up by the number of roots we found each time in order to - * make sure we don't have any overlap. - * - * 2) We first search all the roots that reference the area _except_ the root - * we're acting on currently. This makes up the old_refcnt of all the qgroups - * before. - * - * 3) We walk all of the qgroups referenced by the root we are currently acting - * on, and will either adjust old_refcnt in the case of a removal or the - * new_refcnt in the case of an addition. - * - * 4) Finally we walk all the qgroups that are referenced by this range - * including the root we are acting on currently. We will adjust the counters - * based on the number of roots we had and will have after this operation. - * - * Take this example as an illustration - * - * [qgroup 1/0] - * / | \ - * [qg 0/0] [qg 0/1] [qg 0/2] - * \ | / - * [ extent ] - * - * Say we are adding a reference that is covered by qg 0/0. The first step - * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with - * old_roots being 2. Because it is adding new_roots will be 1. We then go - * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's - * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we - * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a - * reference and thus must add the size to the referenced bytes. Everything - * else is the same so nothing else changes. - */ -static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes, + struct ulist *old_roots, struct ulist *new_roots) { - struct ulist *roots = NULL; - struct ulist *qgroups, *tmp; - struct btrfs_qgroup *qgroup; - struct seq_list elem = SEQ_LIST_INIT(elem); + struct ulist *qgroups = NULL; + struct ulist *tmp = NULL; u64 seq; - int old_roots = 0; - int new_roots = 0; + u64 nr_new_roots = 0; + u64 nr_old_roots = 0; int ret = 0; - if (oper->elem.seq) { - ret = check_existing_refs(trans, fs_info, oper); - if (ret < 0) - return ret; - if (ret) - return 0; - } + if (new_roots) + nr_new_roots = new_roots->nnodes; + if (old_roots) + nr_old_roots = old_roots->nnodes; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - return -ENOMEM; + if (!fs_info->quota_enabled) + goto out_free; + BUG_ON(!fs_info->quota_root); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) { + ret = -ENOMEM; + goto out_free; + } tmp = ulist_alloc(GFP_NOFS); if (!tmp) { - ulist_free(qgroups); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, - &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) { - ulist_free(qgroups); - ulist_free(tmp); - return ret; + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + ret = 0; + goto out_free; + } } + mutex_unlock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; seq = fs_info->qgroup_seq; - /* - * So roots is the list of all the roots currently pointing at the - * bytenr, including the ref we are adding if we are adding, or not if - * we are removing a ref. So we pass in the ref_root to skip that root - * in our calculations. We set old_refnct and new_refcnt cause who the - * hell knows what everything looked like before, and it doesn't matter - * except... - */ - ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, - seq, &old_roots, 0); - if (ret < 0) - goto out; - - /* - * Now adjust the refcounts of the qgroups that care about this - * reference, either the old_count in the case of removal or new_count - * in the case of an addition. - */ - ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, - seq); + /* Update old refcnts using old_roots */ + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, + UPDATE_OLD); if (ret < 0) goto out; - /* - * ...in the case of removals. If we had a removal before we got around - * to processing this operation then we need to find that guy and count - * his references as if they really existed so we don't end up screwing - * up the exclusive counts. Then whenever we go to process the delete - * everything will be grand and we can account for whatever exclusive - * changes need to be made there. We also have to pass in old_roots so - * we have an accurate count of the roots as it pertains to this - * operations view of the world. - */ - ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, - &old_roots); + /* Update new refcnts using new_roots */ + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, + UPDATE_NEW); if (ret < 0) goto out; - /* - * We are adding our root, need to adjust up the number of roots, - * otherwise old_roots is the number of roots we want. - */ - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - new_roots = old_roots + 1; - } else { - new_roots = old_roots; - old_roots++; - } - fs_info->qgroup_seq += old_roots + 1; - + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + num_bytes, seq); /* - * And now the magic happens, bless Arne for having a pretty elegant - * solution for this. + * Bump qgroup_seq to avoid seq overlap */ - qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, - qgroups, seq, old_roots, new_roots, 0); + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(qgroups); - ulist_free(roots); +out_free: ulist_free(tmp); + ulist_free(qgroups); + ulist_free(old_roots); + ulist_free(new_roots); return ret; } -/* - * Process a reference to a shared subtree. This type of operation is - * queued during snapshot removal when we encounter extents which are - * shared between more than one root. - */ -static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup_list *glist; - struct ulist *parents; - int ret = 0; - int err; - struct btrfs_qgroup *qg; - u64 root_obj = 0; - struct seq_list elem = SEQ_LIST_INIT(elem); - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - elem.seq, &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) - goto out; - - if (roots->nnodes != 1) - goto out; - - ULIST_ITER_INIT(&uiter); - unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ - /* - * If we find our ref root then that means all refs - * this extent has to the root have not yet been - * deleted. In that case, we do nothing and let the - * last ref for this bytenr drive our update. - * - * This can happen for example if an extent is - * referenced multiple times in a snapshot (clone, - * etc). If we are in the middle of snapshot removal, - * queued updates for such an extent will find the - * root if we have not yet finished removing the - * snapshot. - */ - if (unode->val == oper->ref_root) - goto out; - - root_obj = unode->val; - BUG_ON(!root_obj); - - spin_lock(&fs_info->qgroup_lock); - qg = find_qgroup_rb(fs_info, root_obj); - if (!qg) - goto out_unlock; - - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* - * Adjust counts for parent groups. First we find all - * parents, then in the 2nd loop we do the adjustment - * while adding parents of the parents to our ulist. - */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(parents, &uiter))) { - qg = u64_to_ptr(unode->aux); - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* Add any parents of the parents */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - } - -out_unlock: - spin_unlock(&fs_info->qgroup_lock); - -out: - ulist_free(roots); - ulist_free(parents); - return ret; -} - -/* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. - */ -static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct ulist *new_roots = NULL; + struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + if (!ret) { + /* + * Use (u64)-1 as time_seq to do special search, which + * doesn't lock tree or delayed_refs and search current + * root. It's safe inside commit_transaction(). + */ + ret = btrfs_find_all_roots(trans, fs_info, + record->bytenr, (u64)-1, &new_roots); + if (ret < 0) + goto cleanup; + if (qgroup_to_skip) + ulist_del(new_roots, qgroup_to_skip, 0); + ret = btrfs_qgroup_account_extent(trans, fs_info, + record->bytenr, record->num_bytes, + record->old_roots, new_roots); + record->old_roots = NULL; + new_roots = NULL; } - } - mutex_unlock(&fs_info->qgroup_rescan_lock); - - ASSERT(is_fstree(oper->ref_root)); - - trace_btrfs_qgroup_account(oper); - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - case BTRFS_QGROUP_OPER_SUB_EXCL: - ret = qgroup_excl_accounting(fs_info, oper); - break; - case BTRFS_QGROUP_OPER_ADD_SHARED: - case BTRFS_QGROUP_OPER_SUB_SHARED: - ret = qgroup_shared_accounting(trans, fs_info, oper); - break; - case BTRFS_QGROUP_OPER_SUB_SUBTREE: - ret = qgroup_subtree_accounting(trans, fs_info, oper); - break; - default: - ASSERT(0); - } - return ret; -} - -/* - * Needs to be called everytime we run delayed refs, even if there is an error - * in order to cleanup outstanding operations. - */ -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_operation *oper; - int ret = 0; +cleanup: + ulist_free(record->old_roots); + ulist_free(new_roots); + new_roots = NULL; + rb_erase(node, &delayed_refs->dirty_extent_root); + kfree(record); - while (!list_empty(&trans->qgroup_ref_list)) { - oper = list_first_entry(&trans->qgroup_ref_list, - struct btrfs_qgroup_operation, list); - list_del_init(&oper->list); - if (!ret || !trans->aborted) - ret = btrfs_qgroup_account(trans, fs_info, oper); - spin_lock(&fs_info->qgroup_op_lock); - rb_erase(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - kfree(oper); } return ret; } @@ -2637,15 +2188,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *qgroups, - struct ulist *tmp, struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, + struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; - u64 seq; - int new_roots; int slot; int ret; @@ -2695,33 +2244,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ulist_reinit(qgroups); ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, &roots); if (ret < 0) goto out; - spin_lock(&fs_info->qgroup_lock); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - - new_roots = 0; - ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, - seq, &new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - - ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, - seq, 0, new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); + /* For rescan, just pass old_roots as NULL */ + ret = btrfs_qgroup_account_extent(trans, fs_info, + found.objectid, num_bytes, NULL, roots); + if (ret < 0) goto out; - } - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); } out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -2735,7 +2266,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; @@ -2743,12 +2273,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path = btrfs_alloc_path(); if (!path) goto out; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - goto out; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - goto out; scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); if (!scratch_leaf) goto out; @@ -2764,7 +2288,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - qgroups, tmp, scratch_leaf); + scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2774,8 +2298,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); - ulist_free(qgroups); - ulist_free(tmp); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index c5242aa9a4b2..6387dcfa354c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -19,43 +19,18 @@ #ifndef __BTRFS_QGROUP__ #define __BTRFS_QGROUP__ +#include "ulist.h" +#include "delayed-ref.h" + /* - * A description of the operations, all of these operations only happen when we - * are adding the 1st reference for that subvolume in the case of adding space - * or on the last reference delete in the case of subtraction. The only - * exception is the last one, which is added for confusion. - * - * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only - * one pointing at the bytes we are adding. This is called on the first - * allocation. - * - * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be - * shared between subvols. This is called on the creation of a ref that already - * has refs from a different subvolume, so basically reflink. - * - * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only - * one referencing the range. - * - * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with - * refs with other subvolumes. + * Record a dirty extent, and info qgroup to update quota on it + * TODO: Use kmem cache to alloc it. */ -enum btrfs_qgroup_operation_type { - BTRFS_QGROUP_OPER_ADD_EXCL, - BTRFS_QGROUP_OPER_ADD_SHARED, - BTRFS_QGROUP_OPER_SUB_EXCL, - BTRFS_QGROUP_OPER_SUB_SHARED, - BTRFS_QGROUP_OPER_SUB_SUBTREE, -}; - -struct btrfs_qgroup_operation { - u64 ref_root; +struct btrfs_qgroup_extent_record { + struct rb_node node; u64 bytenr; u64 num_bytes; - u64 seq; - enum btrfs_qgroup_operation_type type; - struct seq_list elem; - struct rb_node n; - struct list_head list; + struct ulist *old_roots; }; int btrfs_quota_enable(struct btrfs_trans_handle *trans, @@ -79,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, - int mod_seq); -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper); + struct ulist *old_roots, struct ulist *new_roots); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fa72068bd256..fcf7265ca46f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -61,9 +61,10 @@ #define RBIO_CACHE_SIZE 1024 enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE = 0, - BTRFS_RBIO_READ_REBUILD = 1, - BTRFS_RBIO_PARITY_SCRUB = 2, + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, }; struct btrfs_raid_bio { @@ -602,6 +603,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, cur->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + cur->operation == BTRFS_RBIO_REBUILD_MISSING) + return 0; + return 1; } @@ -793,7 +798,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) if (next->operation == BTRFS_RBIO_READ_REBUILD) async_read_rebuild(next); - else if (next->operation == BTRFS_RBIO_WRITE) { + else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { + steal_rbio(rbio, next); + async_read_rebuild(next); + } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); async_rmw_stripe(next); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { @@ -851,7 +859,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -864,9 +872,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - if (uptodate) - set_bit(BIO_UPTODATE, &cur->bi_flags); - bio_endio(cur, err); + cur->bi_error = err; + bio_endio(cur); cur = next; } } @@ -875,9 +882,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) * end io function used by finish_rmw. When we finally * get here, we've written a full stripe */ -static void raid_write_end_io(struct bio *bio, int err) +static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; + int err = bio->bi_error; if (err) fail_bio_stripe(rbio, bio); @@ -893,7 +901,7 @@ static void raid_write_end_io(struct bio *bio, int err) if (atomic_read(&rbio->error) > rbio->bbio->max_errors) err = -EIO; - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); return; } @@ -1071,7 +1079,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - test_bit(BIO_UPTODATE, &last->bi_flags) && + !last->bi_error && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); if (ret == PAGE_CACHE_SIZE) @@ -1087,7 +1095,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; - set_bit(BIO_UPTODATE, &bio->bi_flags); bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); bio_list_add(bio_list, bio); @@ -1312,13 +1319,12 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(WRITE, bio); } return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } /* @@ -1441,11 +1447,11 @@ static void set_bio_pages_uptodate(struct bio *bio) * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid_rmw_end_io(struct bio *bio, int err) +static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1455,7 +1461,6 @@ static void raid_rmw_end_io(struct bio *bio, int err) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - err = 0; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) goto cleanup; @@ -1469,7 +1474,7 @@ static void raid_rmw_end_io(struct bio *bio, int err) cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } static void async_rmw_stripe(struct btrfs_raid_bio *rbio) @@ -1572,14 +1577,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } /* the actual write will happen once the reads are done */ return 0; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); return -EIO; finish: @@ -1809,7 +1813,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) faila = rbio->faila; failb = rbio->failb; - if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { spin_lock_irq(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); @@ -1834,7 +1839,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD && + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1943,7 +1949,8 @@ pstripe: * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD && + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1964,7 +1971,9 @@ cleanup_io: else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - rbio_orig_end_io(rbio, err, err == 0); + rbio_orig_end_io(rbio, err); + } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { + rbio_orig_end_io(rbio, err); } else if (err == 0) { rbio->faila = -1; rbio->failb = -1; @@ -1976,7 +1985,7 @@ cleanup_io: else BUG(); } else { - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); } } @@ -1984,7 +1993,7 @@ cleanup_io: * This is called only for stripes we've read from disk to * reconstruct the parity. */ -static void raid_recover_end_io(struct bio *bio, int err) +static void raid_recover_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1992,7 +2001,7 @@ static void raid_recover_end_io(struct bio *bio, int err) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2002,7 +2011,7 @@ static void raid_recover_end_io(struct bio *bio, int err) return; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); else __raid_recover_end_io(rbio); } @@ -2094,15 +2103,15 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } out: return 0; cleanup: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD) - rbio_orig_end_io(rbio, -EIO, 0); + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) + rbio_orig_end_io(rbio, -EIO); return -EIO; } @@ -2232,8 +2241,9 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, return rbio; } -void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, - struct page *page, u64 logical) +/* Used for both parity scrub and missing. */ +void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, + u64 logical) { int stripe_offset; int index; @@ -2277,11 +2287,12 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) * end io function used by finish_rmw. When we finally * get here, we've written a full stripe */ -static void raid_write_parity_end_io(struct bio *bio, int err) +static void raid_write_parity_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; + int err = bio->bi_error; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); bio_put(bio); @@ -2294,7 +2305,7 @@ static void raid_write_parity_end_io(struct bio *bio, int err) if (atomic_read(&rbio->error)) err = -EIO; - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); } static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, @@ -2437,7 +2448,7 @@ submit_write: nr_data = bio_list_size(&bio_list); if (!nr_data) { /* Every parity is right */ - rbio_orig_end_io(rbio, 0, 0); + rbio_orig_end_io(rbio, 0); return; } @@ -2450,13 +2461,12 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_parity_end_io; - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(WRITE, bio); } return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2524,7 +2534,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } /* @@ -2535,11 +2545,11 @@ cleanup: * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio, int err) +static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2632,14 +2642,13 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } /* the actual write will happen once the reads are done */ return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); return; finish: @@ -2668,3 +2677,55 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) async_scrub_parity(rbio); } + +/* The following code is used for dev replace of a missing RAID 5/6 device. */ + +struct btrfs_raid_bio * +raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 length) +{ + struct btrfs_raid_bio *rbio; + + rbio = alloc_rbio(root, bbio, length); + if (IS_ERR(rbio)) + return NULL; + + rbio->operation = BTRFS_RBIO_REBUILD_MISSING; + bio_list_add(&rbio->bio_list, bio); + /* + * This is a special bio which is used to hold the completion handler + * and make the scrub rbio is similar to the other types + */ + ASSERT(!bio->bi_iter.bi_size); + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { + BUG(); + kfree(rbio); + return NULL; + } + + return rbio; +} + +static void missing_raid56_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + __raid56_parity_recover(rbio); +} + +static void async_missing_raid56(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + missing_raid56_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); +} + +void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) +{ + if (!lock_stripe_add(rbio)) + async_missing_raid56(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 2b5d7977d83b..8b694699d502 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -48,15 +48,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len); +void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, + u64 logical); + struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); -void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, - struct page *page, u64 logical); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); +struct btrfs_raid_bio * +raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 length); +void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 0e7beea92b4c..4645cd16d5ba 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -328,6 +328,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct btrfs_device *prev_dev; u32 blocksize; u64 length; + int real_stripes; int nzones = 0; int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; @@ -369,7 +370,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, goto error; } - for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { + real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + for (nzones = 0; nzones < real_stripes; ++nzones) { struct reada_zone *zone; dev = bbio->stripes[nzones].dev; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 74b24b01d574..303babeef505 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1847,8 +1847,10 @@ again: } eb = read_tree_block(dest, old_bytenr, old_ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { - ret = (!eb) ? -ENOMEM : -EIO; + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { + ret = -EIO; free_extent_buffer(eb); break; } @@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); eb = read_tree_block(root, bytenr, ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2519,8 +2523,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, * counted. return -ENOENT if the block is root of reloc tree. */ static noinline_for_stack -struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node) +struct btrfs_root *select_one_root(struct backref_node *node) { struct backref_node *next; struct btrfs_root *root; @@ -2710,7 +2713,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, generation); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + err = PTR_ERR(eb); + goto next; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); err = -EIO; goto next; @@ -2873,7 +2879,9 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.offset); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2903,7 +2911,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, return 0; BUG_ON(node->processed); - root = select_one_root(trans, node); + root = select_one_root(node); if (root == ERR_PTR(-ENOENT)) { update_processed_blocks(rc, node); goto out; @@ -3746,8 +3754,7 @@ out: * helper to find next unprocessed extent */ static noinline_for_stack -int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path, +int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, struct btrfs_key *extent_key) { struct btrfs_key key; @@ -3942,7 +3949,7 @@ restart: continue; } - ret = find_next_extent(trans, rc, path, &key); + ret = find_next_extent(rc, path, &key); if (ret < 0) err = ret; if (ret != 0) @@ -3967,6 +3974,10 @@ restart: sizeof(struct btrfs_extent_item_v0)); ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, &path_change); + if (ret < 0) { + err = ret; + break; + } if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; else @@ -4040,7 +4051,7 @@ restart: if (trans && progress && err == -ENOSPC) { ret = btrfs_force_chunk_alloc(trans, rc->extent_root, rc->block_group->flags); - if (ret == 0) { + if (ret == 1) { err = 0; progress = 0; goto restart; @@ -4131,7 +4142,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_key key; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + u64 objectid; int err = 0; root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); @@ -4206,14 +4217,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - if (!rc->block_group->ro) { - ret = btrfs_set_block_group_ro(extent_root, rc->block_group); - if (ret) { - err = ret; - goto out; - } - rw = 1; + ret = btrfs_inc_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; } + rw = 1; path = btrfs_alloc_path(); if (!path) { @@ -4285,7 +4294,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); out: if (err && rw) - btrfs_set_block_group_rw(extent_root, rc->block_group); + btrfs_dec_block_group_ro(extent_root, rc->block_group); iput(rc->data_inode); btrfs_put_block_group(rc->block_group); kfree(rc); @@ -4585,8 +4594,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, * called before creating snapshot. it calculates metadata reservation * requried for relocating tree blocks in the snapshot */ -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) { struct btrfs_root *root; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ab5811545a98..9a11db0c47ee 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -125,6 +125,7 @@ struct scrub_block { /* It is for the data with checksum */ unsigned int data_corrected:1; }; + struct btrfs_work work; }; /* Used for the chunks with parity stripe such RAID5/6 */ @@ -278,7 +279,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, int force, u64 physical_for_dev_replace); -static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io(struct bio *bio); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, @@ -295,7 +296,7 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); -static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io(struct bio *bio); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); @@ -332,11 +333,14 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) } } -static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +static void scrub_pause_on(struct btrfs_fs_info *fs_info) { atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); +} +static void scrub_pause_off(struct btrfs_fs_info *fs_info) +{ mutex_lock(&fs_info->scrub_lock); __scrub_blocked_if_needed(fs_info); atomic_dec(&fs_info->scrubs_paused); @@ -345,6 +349,12 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) wake_up(&fs_info->scrub_pause_wait); } +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + scrub_pause_on(fs_info); + scrub_pause_off(fs_info); +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -454,27 +464,14 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_rd_bio; int ret; - /* - * the setting of pages_per_rd_bio is correct for scrub but might - * be wrong for the dev_replace code where we might read from - * different devices in the initial huge bios. However, that - * code is able to correctly handle the case when adding a page - * to a bio fails. - */ - if (dev->bdev) - pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, - bio_get_nr_vecs(dev->bdev)); - else - pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; atomic_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_rd_bio = pages_per_rd_bio; + sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; sctx->dev_root = dev->dev_root; for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { @@ -1429,11 +1426,11 @@ struct scrub_bio_ret { int error; }; -static void scrub_bio_wait_endio(struct bio *bio, int error) +static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = error; + ret->error = bio->bi_error; complete(&ret->event); } @@ -1790,12 +1787,12 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) btrfsic_submit_bio(WRITE, sbio->bio); } -static void scrub_wr_bio_end_io(struct bio *bio, int err) +static void scrub_wr_bio_end_io(struct bio *bio) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; - sbio->err = err; + sbio->err = bio->bi_error; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -2087,21 +2084,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - - if (!sbio->bio->bi_bdev) { - /* - * this case should not happen. If btrfs_map_block() is - * wrong, it could happen for dev-replace operations on - * missing devices when no mirrors are available, but in - * this case it should already fail the mount. - * This case is handled correctly (but _very_ slowly). - */ - printk_ratelimited(KERN_WARNING - "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); - bio_endio(sbio->bio, -EIO); - } else { - btrfsic_submit_bio(READ, sbio->bio); - } + btrfsic_submit_bio(READ, sbio->bio); } static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, @@ -2178,6 +2161,134 @@ again: return 0; } +static void scrub_missing_raid56_end_io(struct bio *bio) +{ + struct scrub_block *sblock = bio->bi_private; + struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info; + + if (bio->bi_error) + sblock->no_io_error_seen = 0; + + btrfs_queue_work(fs_info->scrub_workers, &sblock->work); +} + +static void scrub_missing_raid56_worker(struct btrfs_work *work) +{ + struct scrub_block *sblock = container_of(work, struct scrub_block, work); + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + u64 generation; + u64 logical; + struct btrfs_device *dev; + + is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock->pagev[0]->have_csum; + csum = sblock->pagev[0]->csum; + generation = sblock->pagev[0]->generation; + logical = sblock->pagev[0]->logical; + dev = sblock->pagev[0]->dev; + + if (sblock->no_io_error_seen) { + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + } + + if (!sblock->no_io_error_seen) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); + printk_ratelimited_in_rcu(KERN_ERR + "BTRFS: I/O error rebulding logical %llu for dev %s\n", + logical, rcu_str_deref(dev->name)); + } else if (sblock->header_error || sblock->checksum_error) { + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + printk_ratelimited_in_rcu(KERN_ERR + "BTRFS: failed to rebuild valid logical %llu for dev %s\n", + logical, rcu_str_deref(dev->name)); + } else { + scrub_write_block_to_dev_replace(sblock); + } + + scrub_block_put(sblock); + + if (sctx->is_dev_replace && + atomic_read(&sctx->wr_ctx.flush_all_writes)) { + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + } + + scrub_pending_bio_dec(sctx); +} + +static void scrub_missing_raid56_pages(struct scrub_block *sblock) +{ + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + u64 length = sblock->page_count * PAGE_SIZE; + u64 logical = sblock->pagev[0]->logical; + struct btrfs_bio *bbio; + struct bio *bio; + struct btrfs_raid_bio *rbio; + int ret; + int i; + + ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length, + &bbio, 0, 1); + if (ret || !bbio || !bbio->raid_map) + goto bbio_out; + + if (WARN_ON(!sctx->is_dev_replace || + !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { + /* + * We shouldn't be scrubbing a missing device. Even for dev + * replace, we should only get here for RAID 5/6. We either + * managed to mount something with no mirrors remaining or + * there's a bug in scrub_remap_extent()/btrfs_map_block(). + */ + goto bbio_out; + } + + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + goto bbio_out; + + bio->bi_iter.bi_sector = logical >> 9; + bio->bi_private = sblock; + bio->bi_end_io = scrub_missing_raid56_end_io; + + rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length); + if (!rbio) + goto rbio_out; + + for (i = 0; i < sblock->page_count; i++) { + struct scrub_page *spage = sblock->pagev[i]; + + raid56_add_scrub_pages(rbio, spage->page, spage->logical); + } + + btrfs_init_work(&sblock->work, btrfs_scrub_helper, + scrub_missing_raid56_worker, NULL, NULL); + scrub_block_get(sblock); + scrub_pending_bio_inc(sctx); + raid56_submit_missing_rbio(rbio); + return; + +rbio_out: + bio_put(bio); +bbio_out: + btrfs_put_bbio(bbio); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); +} + static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, int force, @@ -2241,31 +2352,39 @@ leave_nomem: } WARN_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; - int ret; + if (dev->missing) { + /* + * This case should only be hit for RAID 5/6 device replace. See + * the comment in scrub_missing_raid56_pages() for details. + */ + scrub_missing_raid56_pages(sblock); + } else { + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); - if (ret) { - scrub_block_put(sblock); - return ret; + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } } - } - if (force) - scrub_submit(sctx); + if (force) + scrub_submit(sctx); + } /* last one frees, either here or in bio completion for last page */ scrub_block_put(sblock); return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_bio_end_io(struct bio *bio) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; - sbio->err = err; + sbio->err = bio->bi_error; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2564,6 +2683,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; + if (dev->missing) { + scrub_parity_mark_sectors_error(sparity, logical, len); + return 0; + } + if (flags & BTRFS_EXTENT_FLAG_DATA) { blocksize = sctx->sectorsize; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { @@ -2662,18 +2786,30 @@ static void scrub_free_parity(struct scrub_parity *sparity) kfree(sparity); } -static void scrub_parity_bio_endio(struct bio *bio, int error) +static void scrub_parity_bio_endio_worker(struct btrfs_work *work) { - struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_parity *sparity = container_of(work, struct scrub_parity, + work); struct scrub_ctx *sctx = sparity->sctx; - if (error) + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); +} + +static void scrub_parity_bio_endio(struct bio *bio) +{ + struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + + if (bio->bi_error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); - scrub_free_parity(sparity); - scrub_pending_bio_dec(sctx); bio_put(bio); + + btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, + scrub_parity_bio_endio_worker, NULL, NULL); + btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, + &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -2690,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) sparity->nsectors)) goto out; - length = sparity->logic_end - sparity->logic_start + 1; + length = sparity->logic_end - sparity->logic_start; ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, sparity->logic_start, &length, &bbio, 0, 1); @@ -2713,8 +2849,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto rbio_out; list_for_each_entry(spage, &sparity->spages, list) - raid56_parity_add_scrub_pages(rbio, spage->page, - spage->logical); + raid56_add_scrub_pages(rbio, spage->page, spage->logical); scrub_pending_bio_inc(sctx); raid56_parity_submit_scrub_rbio(rbio); @@ -2762,6 +2897,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; + struct btrfs_bio *bbio = NULL; u64 flags; int ret; int slot; @@ -2771,6 +2907,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 extent_logical; u64 extent_physical; u64 extent_len; + u64 mapped_length; struct btrfs_device *extent_dev; struct scrub_parity *sparity; int nsectors; @@ -2844,6 +2981,10 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, } btrfs_item_key_to_cpu(l, &key, slot); + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) bytes = root->nodesize; else @@ -2852,11 +2993,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, if (key.objectid + bytes <= logic_start) goto next; - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.objectid > logic_end) { + if (key.objectid >= logic_end) { stop_loop = 1; break; } @@ -2869,11 +3006,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, flags = btrfs_extent_flags(l, extent); generation = btrfs_extent_generation(l, extent); - if (key.objectid < logic_start && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. logical=%llu", - key.objectid, logic_start); + if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + (key.objectid < logic_start || + key.objectid + bytes > + logic_start + map->stripe_len)) { + btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + key.objectid, logic_start); goto next; } again: @@ -2893,10 +3031,21 @@ again: scrub_parity_mark_sectors_data(sparity, extent_logical, extent_len); - scrub_remap_extent(fs_info, extent_logical, - extent_len, &extent_physical, - &extent_dev, - &extent_mirror_num); + mapped_length = extent_len; + ret = btrfs_map_block(fs_info, READ, extent_logical, + &mapped_length, &bbio, 0); + if (!ret) { + if (!bbio || mapped_length < extent_len) + ret = -EIO; + } + if (ret) { + btrfs_put_bbio(bbio); + goto out; + } + extent_physical = bbio->stripes[0].physical; + extent_mirror_num = bbio->mirror_num; + extent_dev = bbio->stripes[0].dev; + btrfs_put_bbio(bbio); ret = btrfs_lookup_csums_range(csum_root, extent_logical, @@ -2911,10 +3060,12 @@ again: extent_dev, flags, generation, extent_mirror_num); + + scrub_free_csums(sctx); + if (ret) goto out; - scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { logic_start += map->stripe_len; @@ -2943,7 +3094,7 @@ next: out: if (ret < 0) scrub_parity_mark_sectors_error(sparity, logic_start, - logic_end - logic_start + 1); + logic_end - logic_start); scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_ctx.wr_lock); @@ -3092,22 +3243,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ ret = 0; while (physical < physical_end) { - /* for raid56, we skip parity stripe */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, num, - map, &logical, &stripe_logical); - logical += base; - if (ret) { - stripe_logical += base; - stripe_end = stripe_logical + increment - 1; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - ppath, stripe_logical, - stripe_end); - if (ret) - goto out; - goto skip; - } - } /* * canceled? */ @@ -3132,6 +3267,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); } + /* for raid56, we skip parity stripe */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = get_raid56_logic_offset(physical, num, map, + &logical, + &stripe_logical); + logical += base; + if (ret) { + stripe_logical += base; + stripe_end = stripe_logical + increment; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + ppath, stripe_logical, + stripe_end); + if (ret) + goto out; + goto skip; + } + } + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else @@ -3176,6 +3329,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } btrfs_item_key_to_cpu(l, &key, slot); + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) bytes = root->nodesize; else @@ -3184,10 +3341,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (key.objectid + bytes <= logical) goto next; - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - if (key.objectid >= logical + map->stripe_len) { /* out of this device extent */ if (key.objectid >= logic_end) @@ -3200,8 +3353,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, flags = btrfs_extent_flags(l, extent); generation = btrfs_extent_generation(l, extent); - if (key.objectid < logical && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + (key.objectid < logical || + key.objectid + bytes > + logical + map->stripe_len)) { btrfs_err(fs_info, "scrub: tree block %llu spanning " "stripes, ignored. logical=%llu", @@ -3235,9 +3390,11 @@ again: &extent_dev, &extent_mirror_num); - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sctx->csum_list, 1); + ret = btrfs_lookup_csums_range(csum_root, + extent_logical, + extent_logical + + extent_len - 1, + &sctx->csum_list, 1); if (ret) goto out; @@ -3245,10 +3402,12 @@ again: extent_physical, extent_dev, flags, generation, extent_mirror_num, extent_logical - logical + physical); + + scrub_free_csums(sctx); + if (ret) goto out; - scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -3266,7 +3425,7 @@ loop: if (ret && physical < physical_end) { stripe_logical += base; stripe_end = stripe_logical + - increment - 1; + increment; ret = scrub_raid56_parity(sctx, map, scrub_dev, ppath, stripe_logical, @@ -3375,7 +3534,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, u64 chunk_tree; u64 chunk_objectid; u64 chunk_offset; - int ret; + int ret = 0; int slot; struct extent_buffer *l; struct btrfs_key key; @@ -3403,8 +3562,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); - if (ret) + if (ret < 0) + break; + if (ret > 0) { + ret = 0; break; + } + } else { + ret = 0; } } @@ -3446,6 +3611,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + /* + * we need call btrfs_inc_block_group_ro() with scrubs_paused, + * to avoid deadlock caused by: + * btrfs_inc_block_group_ro() + * -> btrfs_wait_for_commit() + * -> btrfs_commit_transaction() + * -> btrfs_scrub_pause() + */ + scrub_pause_on(fs_info); + ret = btrfs_inc_block_group_ro(root, cache); + scrub_pause_off(fs_info); + if (ret) { + btrfs_put_block_group(cache); + break; + } + dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; @@ -3471,8 +3652,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); + + scrub_pause_on(fs_info); /* * must be called before we decrease @scrub_paused. @@ -3483,11 +3664,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, atomic_read(&sctx->workers_pending) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - mutex_lock(&fs_info->scrub_lock); - __scrub_blocked_if_needed(fs_info); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_pause_off(fs_info); + + btrfs_dec_block_group_ro(root, cache); btrfs_put_block_group(cache); if (ret) @@ -3511,11 +3690,7 @@ skip: btrfs_free_path(path); - /* - * ret can still be 1 from search_slot or next_leaf, - * that's not an error - */ - return ret < 0 ? ret : 0; + return ret; } static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, @@ -3559,7 +3734,6 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { - int ret = 0; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; @@ -3572,27 +3746,36 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, fs_info->scrub_workers = btrfs_alloc_workqueue("btrfs-scrub", flags, max_active, 4); - if (!fs_info->scrub_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_workers) + goto fail_scrub_workers; + fs_info->scrub_wr_completion_workers = btrfs_alloc_workqueue("btrfs-scrubwrc", flags, max_active, 2); - if (!fs_info->scrub_wr_completion_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_wr_completion_workers) + goto fail_scrub_wr_completion_workers; + fs_info->scrub_nocow_workers = btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); - if (!fs_info->scrub_nocow_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_nocow_workers) + goto fail_scrub_nocow_workers; + fs_info->scrub_parity_workers = + btrfs_alloc_workqueue("btrfs-scrubparity", flags, + max_active, 2); + if (!fs_info->scrub_parity_workers) + goto fail_scrub_parity_workers; } ++fs_info->scrub_workers_refcnt; -out: - return ret; + return 0; + +fail_scrub_parity_workers: + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); +fail_scrub_nocow_workers: + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); +fail_scrub_wr_completion_workers: + btrfs_destroy_workqueue(fs_info->scrub_workers); +fail_scrub_workers: + return -ENOMEM; } static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) @@ -3601,6 +3784,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_parity_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } @@ -3875,8 +4059,7 @@ static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, return 0; WARN_ON(!dev->bdev); - wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, - bio_get_nr_vecs(dev->bdev)); + wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; wr_ctx->tgtdev = dev; atomic_set(&wr_ctx->flush_all_writes, 0); return 0; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a1216f9b4917..aa72bfd28f7d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -243,6 +243,7 @@ struct waiting_dir_move { * after this directory is moved, we can try to rmdir the ino rmdir_ino. */ u64 rmdir_ino; + bool orphanized; }; struct orphan_dir_info { @@ -1158,6 +1159,9 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; + /* data offset in the file extent item */ + u64 data_offset; + /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) if (ret < 0) return ret; - if (offset + bctx->extent_len > i_size) + if (offset + bctx->data_offset + bctx->extent_len > i_size) return 0; /* @@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; + /* + * For non-compressed extents iterate_extent_inodes() gives us extent + * offsets that already take into account the data offset, but not for + * compressed extents, since the offset is logical and not relative to + * the physical extent locations. We must take this into account to + * avoid sending clone offsets that go beyond the source file's size, + * which would result in the clone ioctl failing with -EINVAL on the + * receiving end. + */ + if (compressed == BTRFS_COMPRESS_NONE) + backref_ctx->data_offset = 0; + else + backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1900,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - /* we know that it is or will be overwritten. check this now */ - if (ow_inode < sctx->send_progress) + /* + * We know that it is or will be overwritten. Check this now. + * The current inode being processed might have been the one that caused + * inode 'ino' to be orphanized, therefore ow_inode can actually be the + * same as sctx->send_progress. + */ + if (ow_inode <= sctx->send_progress) ret = 1; else ret = 0; @@ -2223,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + struct waiting_dir_move *wdm; + fs_path_reset(name); if (is_waiting_for_rm(sctx, ino)) { @@ -2233,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, break; } - if (is_waiting_for_move(sctx, ino)) { + wdm = get_waiting_dir_move(sctx, ino); + if (wdm && wdm->orphanized) { + ret = gen_unique_name(sctx, ino, gen, name); + stop = 1; + } else if (wdm) { ret = get_first_ref(sctx->parent_root, ino, &parent_inode, &parent_gen, name); } else { @@ -2328,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - sctx->parent_root->root_item.uuid); + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(sctx->parent_root->root_item.ctransid)); } @@ -2923,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) return entry != NULL; } -static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) { struct rb_node **p = &sctx->waiting_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2934,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return -ENOMEM; dm->ino = ino; dm->rmdir_ino = 0; + dm->orphanized = orphanized; while (*p) { parent = *p; @@ -3030,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, goto out; } - ret = add_waiting_dir_move(sctx, pm->ino); + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); if (ret) goto out; @@ -3353,8 +3386,40 @@ out: return ret; } +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int is_ancestor(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + int ret; + u64 parent; + u64 parent_gen; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) { + if (ret == -ENOENT && ino == ino2) + ret = 0; + return ret; + } + if (parent == ino1) + return parent_gen == ino1_gen ? 1 : 0; + ino = parent; + } + return 0; +} + static int wait_for_parent_move(struct send_ctx *sctx, - struct recorded_ref *parent_ref) + struct recorded_ref *parent_ref, + const bool is_orphan) { int ret = 0; u64 ino = parent_ref->dir; @@ -3374,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx, * Our current directory inode may not yet be renamed/moved because some * ancestor (immediate or not) has to be renamed/moved first. So find if * such ancestor exists and make sure our own rename/move happens after - * that ancestor is processed. + * that ancestor is processed to avoid path build infinite loops (done + * at get_cur_path()). */ while (ino > BTRFS_FIRST_FREE_OBJECTID) { if (is_waiting_for_move(sctx, ino)) { - ret = 1; + /* + * If the current inode is an ancestor of ino in the + * parent root, we need to delay the rename of the + * current inode, otherwise don't delayed the rename + * because we can end up with a circular dependency + * of renames, resulting in some directories never + * getting the respective rename operations issued in + * the send stream or getting into infinite path build + * loops. + */ + ret = is_ancestor(sctx->parent_root, + sctx->cur_ino, sctx->cur_inode_gen, + ino, path_before); break; } @@ -3420,7 +3498,7 @@ out: ino, &sctx->new_refs, &sctx->deleted_refs, - false); + is_orphan); if (!ret) ret = 1; } @@ -3589,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && + can_rename) { + ret = wait_for_parent_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move @@ -3609,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = wait_for_parent_move(sctx, cur); - if (ret < 0) - goto out; - if (ret) { - *pending_move = 1; - } else { - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, - cur->full_path); - } + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); if (ret < 0) goto out; } else { @@ -4508,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " if (ret < 0) goto out; - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root->root->root_item.uuid); + /* + * If the parent we're using has a received_uuid set then use that as + * our clone source as that is what we will look for when doing a + * receive. + * + * This covers the case that we create a snapshot off of a received + * subvolume and then use that as the parent and try to receive on a + * different host. + */ + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9e66f5e724db..2b07b3581781 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -69,7 +69,7 @@ static struct file_system_type btrfs_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -static const char *btrfs_decode_error(int errno) +const char *btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. */ +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, * We'll complete the cleanup in btrfs_end_transaction and * btrfs_commit_transaction. */ +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - /* - * Report first abort since mount - */ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, - &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", - errno); - } trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ @@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, * __btrfs_panic decodes unexpected, fatal errors from the caller, * issues an alert, and either panics or BUGs, depending on mount options. */ +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -841,33 +836,153 @@ out: return error; } -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) +static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_key location; - struct inode *inode; - u64 dir_id; - int new = 0; + struct btrfs_root *fs_root; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + struct btrfs_path *path = NULL; + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + path->leave_spinning = 1; + + name = kmalloc(PATH_MAX, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; /* - * We have a specific subvol we want to mount, just setup location and - * go look up the root. + * Walk up the subvolume trees in the tree of tree roots by root + * backrefs until we hit the top-level subvolume. */ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, subvol_objectid, + BTRFS_ROOT_BACKREF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + goto err; + } + + /* + * Walk up the filesystem tree by inode refs until we hit the + * root directory. + */ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(fs_root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + dirid = key.offset; + + inode_ref = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], + inode_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(inode_ref + 1), len); + ptr[0] = '/'; + btrfs_release_path(path); + } + } + + btrfs_free_path(path); + if (ptr == name + PATH_MAX - 1) { + name[0] = '/'; + name[1] = '\0'; + } else { + memmove(name, ptr, name + PATH_MAX - ptr); } + return name; + +err: + btrfs_free_path(path); + kfree(name); + return ERR_PTR(ret); +} + +static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + u64 dir_id; path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); + return -ENOMEM; path->leave_spinning = 1; /* @@ -879,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb, di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); - return ERR_CAST(di); + return PTR_ERR(di); } if (!di) { /* * Ok the default dir item isn't there. This is weird since * it's always been there, but don't freak out, just try and - * mount to root most subvolume. + * mount the top-level subvolume. */ btrfs_free_path(path); - dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; - goto setup_root; + *objectid = BTRFS_FS_TREE_OBJECTID; + return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); btrfs_free_path(path); - -find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (IS_ERR(new_root)) - return ERR_CAST(new_root); - - if (!(sb->s_flags & MS_RDONLY)) { - int ret; - down_read(&fs_info->cleanup_work_sem); - ret = btrfs_orphan_cleanup(new_root); - up_read(&fs_info->cleanup_work_sem); - if (ret) - return ERR_PTR(ret); - } - - dir_id = btrfs_root_dirid(&new_root->root_item); -setup_root: - location.objectid = dir_id; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - inode = btrfs_iget(sb, &location, new_root, &new); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - /* - * If we're just mounting the root most subvol put the inode and return - * a reference to the dentry. We will have already gotten a reference - * to the inode in btrfs_fill_super so we're good to go. - */ - if (!new && d_inode(sb->s_root) == inode) { - iput(inode); - return dget(sb->s_root); - } - - return d_obtain_root(inode); + *objectid = location.objectid; + return 0; } static int btrfs_fill_super(struct super_block *sb, @@ -953,6 +1033,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif sb->s_flags |= MS_I_VERSION; + sb->s_iflags |= SB_I_CGROUPWB; err = open_ctree(sb, fs_devices, (char *)data); if (err) { printk(KERN_ERR "BTRFS: open_ctree failed\n"); @@ -1108,6 +1189,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",subvolid=%llu", + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_puts(seq, ",subvol="); + seq_dentry(seq, dentry, " \t\n\\"); return 0; } @@ -1138,107 +1223,139 @@ static inline int is_subvolume_inode(struct inode *inode) } /* - * This will strip out the subvol=%s argument for an argument string and add - * subvolid=0 to make sure we get the actual tree root for path walking to the - * subvol we want. + * This will add subvolid=0 to the argument string while removing any subvol= + * and subvolid= arguments to make sure we get the top-level root for path + * walking to the subvol we want. */ static char *setup_root_args(char *args) { - unsigned len = strlen(args) + 2 + 1; - char *src, *dst, *buf; - - /* - * We need the same args as before, but with this substitution: - * s!subvol=[^,]+!subvolid=0! - * - * Since the replacement string is up to 2 bytes longer than the - * original, allocate strlen(args) + 2 + 1 bytes. - */ + char *buf, *dst, *sep; - src = strstr(args, "subvol="); - /* This shouldn't happen, but just in case.. */ - if (!src) - return NULL; + if (!args) + return kstrdup("subvolid=0", GFP_NOFS); - buf = dst = kmalloc(len, GFP_NOFS); + /* The worst case is that we add ",subvolid=0" to the end. */ + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); if (!buf) return NULL; - /* - * If the subvol= arg is not at the start of the string, - * copy whatever precedes it into buf. - */ - if (src != args) { - *src++ = '\0'; - strcpy(buf, args); - dst += strlen(args); + while (1) { + sep = strchrnul(args, ','); + if (!strstarts(args, "subvol=") && + !strstarts(args, "subvolid=")) { + memcpy(dst, args, sep - args); + dst += sep - args; + *dst++ = ','; + } + if (*sep) + args = sep + 1; + else + break; } - strcpy(dst, "subvolid=0"); - dst += strlen("subvolid=0"); - - /* - * If there is a "," after the original subvol=... string, - * copy that suffix into our buffer. Otherwise, we're done. - */ - src = strchr(src, ','); - if (src) - strcpy(dst, src); return buf; } -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + int flags, const char *device_name, + char *data) { struct dentry *root; - struct vfsmount *mnt; + struct vfsmount *mnt = NULL; char *newargs; + int ret; newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, - newargs); + if (!newargs) { + root = ERR_PTR(-ENOMEM); + goto out; + } - if (PTR_RET(mnt) == -EBUSY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + device_name, newargs); } else { - int r; - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + device_name, newargs); if (IS_ERR(mnt)) { - kfree(newargs); - return ERR_CAST(mnt); + root = ERR_CAST(mnt); + mnt = NULL; + goto out; } - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); - if (r < 0) { - /* FIXME: release vfsmount mnt ??*/ - kfree(newargs); - return ERR_PTR(r); + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); + up_write(&mnt->mnt_sb->s_umount); + if (ret < 0) { + root = ERR_PTR(ret); + goto out; } } } + if (IS_ERR(mnt)) { + root = ERR_CAST(mnt); + mnt = NULL; + goto out; + } - kfree(newargs); + if (!subvol_name) { + if (!subvol_objectid) { + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), + &subvol_objectid); + if (ret) { + root = ERR_PTR(ret); + goto out; + } + } + subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), + subvol_objectid); + if (IS_ERR(subvol_name)) { + root = ERR_CAST(subvol_name); + subvol_name = NULL; + goto out; + } - if (IS_ERR(mnt)) - return ERR_CAST(mnt); + } root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. */ + mnt = NULL; - if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { + if (!IS_ERR(root)) { struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", - subvol_name); + struct inode *root_inode = d_inode(root); + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + + ret = 0; + if (!is_subvolume_inode(root_inode)) { + pr_err("BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + /* + * This will also catch a race condition where a + * subvolume which was passed by ID is renamed and + * another subvolume is renamed over the old location. + */ + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } } +out: + mntput(mnt); + kfree(newargs); + kfree(subvol_name); return root; } @@ -1303,7 +1420,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev = NULL; struct super_block *s; - struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; @@ -1323,10 +1439,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; + if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + /* mount_subvol() will free subvol_name. */ + return mount_subvol(subvol_name, subvol_objectid, flags, + device_name, data); } security_init_mnt_opts(&new_sec_opts); @@ -1392,23 +1508,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); } - - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) { + if (error) { deactivate_locked_super(s); - error = PTR_ERR(root); goto error_sec_opts; } fs_info = btrfs_sb(s); error = setup_security_options(fs_info, s, &new_sec_opts); if (error) { - dput(root); deactivate_locked_super(s); goto error_sec_opts; } - return root; + return dget(s->s_root); error_close_devices: btrfs_close_devices(fs_devices); @@ -1539,6 +1651,17 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags |= MS_RDONLY; + /* + * Setting MS_RDONLY will put the cleaner thread to + * sleep at the next loop if it's already active. + * If it's already asleep, we'll leave unused block + * groups on disk until we're mounted read-write again + * unless we clean them up here. + */ + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_delete_unused_bgs(fs_info); + mutex_unlock(&root->fs_info->cleaner_mutex); + btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); btrfs_pause_balance(fs_info); @@ -2052,8 +2175,7 @@ static int btrfs_interface_init(void) static void btrfs_interface_exit(void) { - if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n"); + misc_deregister(&btrfs_misc); } static void btrfs_print_info(void) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e8a4c86d274d..603b0cc2b9bb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -33,6 +33,7 @@ #include "volumes.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) @@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); -static struct attribute *btrfs_attrs[] = { +static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), @@ -438,21 +439,29 @@ static struct attribute *btrfs_attrs[] = { static void btrfs_release_super_kobj(struct kobject *kobj) { - struct btrfs_fs_info *fs_info = to_fs_info(kobj); - complete(&fs_info->kobj_unregister); + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); + + memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_super_kobj, - .default_attrs = btrfs_attrs, }; +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_devices, super_kobj); +} + static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_info, super_kobj); + return to_fs_devs(kobj)->fs_info; } #define NUM_FEATURE_BITS 64 @@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, &agroup); } @@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) return 0; } -static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) +{ + if (fs_devs->device_dir_kobj) { + kobject_del(fs_devs->device_dir_kobj); + kobject_put(fs_devs->device_dir_kobj); + fs_devs->device_dir_kobj = NULL; + } + + if (fs_devs->super_kobj.state_initialized) { + kobject_del(&fs_devs->super_kobj); + kobject_put(&fs_devs->super_kobj); + wait_for_completion(&fs_devs->kobj_unregister); + } +} + +/* when fs_devs is NULL it will remove all fsid kobject */ +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { - kobject_del(&fs_info->super_kobj); - kobject_put(&fs_info->super_kobj); - wait_for_completion(&fs_info->kobj_unregister); + struct list_head *fs_uuids = btrfs_get_fs_uuids(); + + if (fs_devs) { + __btrfs_sysfs_remove_fsid(fs_devs); + return; + } + + list_for_each_entry(fs_devs, fs_uuids, list) { + __btrfs_sysfs_remove_fsid(fs_devs); + } } void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { + btrfs_reset_fs_info_ptr(fs_info); + if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } - kobject_del(fs_info->device_dir_kobj); - kobject_put(fs_info->device_dir_kobj); addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); - __btrfs_sysfs_remove_one(fs_info); + sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); + btrfs_kobj_rm_device(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -602,40 +635,60 @@ static void init_feature_attrs(void) } } -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +/* when one_device is NULL, it removes all device links */ + +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; struct kobject *disk_kobj; - if (!fs_info->device_dir_kobj) + if (!fs_devices->device_dir_kobj) return -EINVAL; if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_info->device_dir_kobj, + sysfs_remove_link(fs_devices->device_dir_kobj, + disk_kobj->name); + } + + if (one_device) + return 0; + + list_for_each_entry(one_device, + &fs_devices->devices, dev_list) { + if (!one_device->bdev) + continue; + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_devices->device_dir_kobj, disk_kobj->name); } return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { - int error = 0; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *dev; - - if (!fs_info->device_dir_kobj) - fs_info->device_dir_kobj = kobject_create_and_add("devices", - &fs_info->super_kobj); + if (!fs_devs->device_dir_kobj) + fs_devs->device_dir_kobj = kobject_create_and_add("devices", + &fs_devs->super_kobj); - if (!fs_info->device_dir_kobj) + if (!fs_devs->device_dir_kobj) return -ENOMEM; + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_device *dev; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_info->device_dir_kobj, + error = sysfs_create_link(fs_devices->device_dir_kobj, disk_kobj, disk_kobj->name); if (error) break; @@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry; /* Debugging tunables and exported data */ u64 btrfs_debugfs_test; +/* + * Can be called by the device discovery thread. + * And parent can be specified for seed device + */ +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent) +{ + int error; + + init_completion(&fs_devs->kobj_unregister); + fs_devs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->super_kobj, + &btrfs_ktype, parent, "%pU", fs_devs->fsid); + return error; +} + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; + struct kobject *super_kobj = &fs_devs->super_kobj; + + btrfs_set_fs_info_ptr(fs_info); - init_completion(&fs_info->kobj_unregister); - fs_info->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, - "%pU", fs_info->fsid); + error = btrfs_kobj_add_device(fs_devs, NULL); if (error) return error; - error = sysfs_create_group(&fs_info->super_kobj, - &btrfs_feature_attr_group); + error = sysfs_create_files(super_kobj, btrfs_attrs); if (error) { - __btrfs_sysfs_remove_one(fs_info); + btrfs_kobj_rm_device(fs_devs, NULL); return error; } - error = addrm_unknown_feature_attrs(fs_info, true); + error = sysfs_create_group(super_kobj, + &btrfs_feature_attr_group); if (error) goto failure; - error = btrfs_kobj_add_device(fs_info, NULL); + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - &fs_info->super_kobj); + super_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3a4bbed723fd..6392527bcc15 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,8 +82,12 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent); +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index c32a7ba76bca..846d277b1901 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -21,6 +21,7 @@ #include "../transaction.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../backref.h" static void init_dummy_trans(struct btrfs_trans_handle *trans) { @@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + /* + * Since the test trans doesn't havee the complicated delayed refs, + * we can only call btrfs_qgroup_account_extent() directly to test + * quota. + */ + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } @@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root) if (ret) return ret; - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root) test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } + old_roots = NULL; + new_roots = NULL; + + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } ret = remove_extent_item(root, 4096, 4096); if (ret) return -EINVAL; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't remove space from the qgroup %d\n", ret); - return -EINVAL; + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } @@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = add_tree_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = remove_extent_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5628e25250c0..8f259b3a66b3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -225,12 +225,14 @@ loop: cur_trans->dirty_bg_run = 0; cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.pending_csums = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -256,6 +258,8 @@ loop: mutex_init(&cur_trans->cache_write_mutex); cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); + INIT_LIST_HEAD(&cur_trans->deleted_bgs); + spin_lock_init(&cur_trans->deleted_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -509,6 +513,7 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->chunk_bytes_reserved = 0; h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; @@ -758,7 +763,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->ordered)) { spin_lock(&info->trans_lock); - list_splice(&trans->ordered, &cur_trans->pending_ordered); + list_splice_init(&trans->ordered, &cur_trans->pending_ordered); spin_unlock(&info->trans_lock); } @@ -792,6 +797,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + btrfs_trans_release_chunk_metadata(trans); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root) && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { @@ -1290,7 +1297,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (pending->error) goto no_free_objectid; - btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + /* + * Make qgroup to skip current new snapshot's qgroupid, as it is + * accounted by later btrfs_qgroup_inherit(). + */ + btrfs_set_skip_qgroup(trans, objectid); + + btrfs_reloc_pre_snapshot(pending, &to_reserve); if (to_reserve > 0) { pending->error = btrfs_block_rsv_add(root, @@ -1298,7 +1311,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, to_reserve, BTRFS_RESERVE_NO_FLUSH); if (pending->error) - goto no_free_objectid; + goto clear_skip_qgroup; } key.objectid = objectid; @@ -1396,25 +1409,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto fail; } - - /* - * We need to flush delayed refs in order to make sure all of our quota - * operations have been done before we call btrfs_qgroup_inherit. - */ - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); @@ -1497,11 +1491,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * account qgroup counters before qgroup_inherit() + */ + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + fail: pending->error = ret; dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; +clear_skip_qgroup: + btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); root_item_alloc_fail: @@ -1620,9 +1640,7 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; @@ -1661,9 +1679,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * async commit thread will be the one to unlock it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_release( - &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); @@ -1848,7 +1864,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); - list_splice(&trans->ordered, &cur_trans->pending_ordered); + list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1875,8 +1891,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->trans_lock); wait_for_commit(root, prev_trans); + ret = prev_trans->aborted; btrfs_put_transaction(prev_trans); + if (ret) + goto cleanup_transaction; } else { spin_unlock(&root->fs_info->trans_lock); } @@ -1963,6 +1982,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto scrub_continue; } + /* Reocrd old roots for later qgroup accounting */ + ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + /* * make sure none of the code above managed to slip in a * delayed item @@ -2004,6 +2030,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_free_log_root_tree(trans, root->fs_info); + /* + * Since fs roots are all committed, we can get a quite accurate + * new_roots. So let's do quota accounting. + */ + ret = btrfs_qgroup_account_extents(trans, root->fs_info); + if (ret < 0) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + ret = commit_cowonly_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); @@ -2054,6 +2091,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + btrfs_trans_release_chunk_metadata(trans); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; @@ -2114,7 +2153,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (current != root->fs_info->transaction_kthread) + if (current != root->fs_info->transaction_kthread && + current != root->fs_info->cleaner_kthread) btrfs_run_delayed_iputs(root); return ret; @@ -2123,6 +2163,7 @@ scrub_continue: btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; if (trans->qgroup_reserved) { btrfs_qgroup_free(root, trans->qgroup_reserved); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0b24755596ba..edc2fbc262d7 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -74,6 +74,8 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; + struct list_head deleted_bgs; + spinlock_t deleted_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; int dirty_bg_run; @@ -102,6 +104,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 chunk_bytes_reserved; u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; @@ -153,6 +156,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, spin_unlock(&BTRFS_I(inode)->lock); } +/* + * Make qgroup codes to skip given qgroupid, means the old/new_roots for + * qgroup won't contain the qgroupid in it. + */ +static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, + u64 qgroupid) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = qgroupid; +} + +static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(!delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = 0; +} + int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index a63719cc9578..a4b9c8b2d35a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; - if (btrfs_test_opt(root, SSD)) - goto out; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d04968374e9d..1bbaace73383 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - int index; - int ret; + int ret = 0; mutex_lock(&root->log_mutex); + if (root->log_root) { if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; goto out; } + if (!root->log_start_pid) { - root->log_start_pid = current->pid; clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; } else if (root->log_start_pid != current->pid) { set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } + } else { + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) + ret = btrfs_init_log_root_tree(trans, root->fs_info); + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; - atomic_inc(&root->log_batch); - atomic_inc(&root->log_writers); - if (ctx) { - index = root->log_transid % 2; - list_add_tail(&ctx->list, &root->log_ctxs[index]); - ctx->log_transid = root->log_transid; - } - mutex_unlock(&root->log_mutex); - return 0; - } - - ret = 0; - mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) - ret = btrfs_init_log_root_tree(trans, root->fs_info); - mutex_unlock(&root->fs_info->tree_log_mutex); - if (ret) - goto out; - - if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) goto out; + + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; } - clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); - root->log_start_pid = current->pid; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); if (ctx) { - index = root->log_transid % 2; + int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; } + out: mutex_unlock(&root->log_mutex); return ret; @@ -731,12 +722,66 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, &ordered_sums, 0); if (ret) goto out; + /* + * Now delete all existing cums in the csum root that + * cover our range. We do this because we can have an + * extent that is completely referenced by one file + * extent item and partially referenced by another + * file extent item (like after using the clone or + * extent_same ioctls). In this case if we end up doing + * the replay of the one that partially references the + * extent first, and we do not do the csum deletion + * below, we can get 2 csum items in the csum tree that + * overlap each other. For example, imagine our log has + * the two following file extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent + * that starts at disk byte 12845056, and the log tree + * has a single csum item that covers the entire range + * of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the + * csum tree gets the following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K + * of our extent. Now when we replay the second file + * extent item, if we do not delete existing csum items + * that cover any of its blocks, we end up getting two + * csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying + * to lookup up for the checksum of any block of our + * extent starting at an offset of 40K or higher, will + * end up looking at the second csum item only, which + * does not contain the checksum for any block starting + * at offset 40K or higher of our extent. + */ while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); if (!ret) + ret = btrfs_del_csums(trans, + root->fs_info->csum_root, + sums->bytenr, + sums->len); + if (!ret) ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums); @@ -1549,9 +1594,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, */ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, u64 index, - char *name, int name_len, u8 type, + char *name, int name_len, struct btrfs_key *location) { struct inode *inode; @@ -1613,6 +1657,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root, * not exist in the FS, it is skipped. fsyncs on directories * do not force down inodes inside that directory, just changes to the * names or unlinks in a directory. + * + * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a + * non-existing inode) and 1 if the name was replayed. */ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1631,6 +1678,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, int exists; int ret = 0; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + bool name_added = false; dir = read_one_inode(root, key->objectid); if (!dir) @@ -1708,6 +1756,8 @@ out: } kfree(name); iput(dir); + if (!ret && name_added) + ret = 1; return ret; insert: @@ -1719,10 +1769,12 @@ insert: goto out; } btrfs_release_path(path); - ret = insert_one_name(trans, root, path, key->objectid, key->offset, - name, name_len, log_type, &log_key); + ret = insert_one_name(trans, root, key->objectid, key->offset, + name, name_len, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; + if (!ret) + name_added = true; update_size = false; ret = 0; goto out; @@ -1740,12 +1792,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - int ret; + int ret = 0; u32 item_size = btrfs_item_size_nr(eb, slot); struct btrfs_dir_item *di; int name_len; unsigned long ptr; unsigned long ptr_end; + struct btrfs_path *fixup_path = NULL; ptr = btrfs_item_ptr_offset(eb, slot); ptr_end = ptr + item_size; @@ -1755,12 +1808,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); - if (ret) - return ret; + if (ret < 0) + break; ptr = (unsigned long)(di + 1); ptr += name_len; + + /* + * If this entry refers to a non-directory (directories can not + * have a link count > 1) and it was added in the transaction + * that was not committed, make sure we fixup the link count of + * the inode it the entry points to. Otherwise something like + * the following would result in a directory pointing to an + * inode with a wrong link that does not account for this dir + * entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * <power failure> + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two + * entries pointing to it in the directory testdir. This would + * make it impossible to ever delete the parent directory has + * it would result in stale dentries that can never be deleted. + */ + if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + struct btrfs_key di_key; + + if (!fixup_path) { + fixup_path = btrfs_alloc_path(); + if (!fixup_path) { + ret = -ENOMEM; + break; + } + } + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, + di_key.objectid); + if (ret) + break; + } + ret = 0; } - return 0; + btrfs_free_path(fixup_path); + return ret; } /* @@ -2535,8 +2635,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static void wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int transid) +static void wait_log_commit(struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2561,8 +2660,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans, atomic_read(&root->log_commit[index])); } -static void wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static void wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); @@ -2642,7 +2740,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, log_transid); + wait_log_commit(root, log_transid); mutex_unlock(&root->log_mutex); return ctx->log_ret; } @@ -2651,7 +2749,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, log_transid - 1); + wait_log_commit(root, log_transid - 1); while (1) { int batch = atomic_read(&root->log_batch); @@ -2662,7 +2760,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); } - wait_for_writer(trans, root); + wait_for_writer(root); if (batch == atomic_read(&root->log_batch)) break; } @@ -2759,7 +2857,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_wait_logged_extents(trans, log, log_transid); - wait_log_commit(trans, log_root_tree, + wait_log_commit(log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); if (!ret) @@ -2770,11 +2868,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { - wait_log_commit(trans, log_root_tree, + wait_log_commit(log_root_tree, root_log_ctx.log_transid - 1); } - wait_for_writer(trans, log_root_tree); + wait_for_writer(log_root_tree); /* * now that we've moved on to the tree of log tree roots, @@ -3881,12 +3979,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, &ordered->flags)) continue; - if (ordered->csum_bytes_left) { - btrfs_start_ordered_extent(inode, ordered, 0); - wait_event(ordered->wait, - ordered->csum_bytes_left == 0); - } - list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) @@ -4123,6 +4215,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, return 0; } +/* + * At the moment we always log all xattrs. This is to figure out at log replay + * time which xattrs must have their deletion replayed. If a xattr is missing + * in the log tree and exists in the fs/subvol tree, we delete it. This is + * because if a xattr is deleted, the inode is fsynced and a power failure + * happens, causing the log to be replayed the next time the fs is mounted, + * we want the xattr to not exist anymore (same behaviour as other filesystems + * with a journal, ext3/4, xfs, f2fs, etc). + */ +static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + int ret; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + int ins_nr = 0; + int start_slot = 0; + + key.objectid = ino; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + int nritems = btrfs_header_nritems(leaf); + + if (slot >= nritems) { + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) + break; + + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + cond_resched(); + } + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * If the no holes feature is enabled we need to make sure any hole between the + * last extent and the i_size of our inode is explicitly marked in the log. This + * is to make sure that doing something like: + * + * 1) create file with 128Kb of data + * 2) truncate file to 64Kb + * 3) truncate file to 256Kb + * 4) fsync file + * 5) <crash/power failure> + * 6) mount fs and trigger log replay + * + * Will give us a file with a size of 256Kb, the first 64Kb of data match what + * the file had in its first 64Kb of data at step 1 and the last 192Kb of the + * file correspond to a hole. The presence of explicit holes in a log tree is + * what guarantees that log replay will remove/adjust file extent items in the + * fs/subvol tree. + * + * Here we do not need to care about holes between extents, that is already done + * by copy_items(). We also only need to do this in the full sync path, where we + * lookup for extents from the fs/subvol tree only. In the fast path case, we + * lookup the list of modified extent maps and if any represents a hole, we + * insert a corresponding extent representing a hole in the log tree. + */ +static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + u64 hole_start; + u64 hole_size; + struct extent_buffer *leaf; + struct btrfs_root *log = root->log_root; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(inode); + + if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ASSERT(ret != 0); + if (ret < 0) + return ret; + + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* inode does not have any extents */ + hole_start = 0; + hole_size = i_size; + } else { + struct btrfs_file_extent_item *extent; + u64 len; + + /* + * If there's an extent beyond i_size, an explicit hole was + * already inserted by copy_items(). + */ + if (key.offset >= i_size) + return 0; + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(leaf, + path->slots[0], + extent); + ASSERT(len == i_size); + return 0; + } + + len = btrfs_file_extent_num_bytes(leaf, extent); + /* Last extent goes beyond i_size, no need to log a hole. */ + if (key.offset + len > i_size) + return 0; + hole_start = key.offset + len; + hole_size = i_size - hole_start; + } + btrfs_release_path(path); + + /* Last extent ends at i_size. */ + if (hole_size == 0) + return 0; + + hole_size = ALIGN(hole_size, root->sectorsize); + ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, + hole_size, 0, hole_size, 0, 0, 0); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4161,6 +4434,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 logged_isize = 0; + bool need_log_inode_item = true; path = btrfs_alloc_path(); if (!path) @@ -4269,11 +4543,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - ret = log_inode_item(trans, log, dst_path, inode); - if (ret) { - err = ret; - goto out_unlock; - } goto log_extents; } @@ -4296,6 +4565,28 @@ again: if (min_key.type > max_key.type) break; + if (min_key.type == BTRFS_INODE_ITEM_KEY) + need_log_inode_item = false; + + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key.type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + if (ret) { + btrfs_release_path(path); + continue; + } + goto next_slot; + } + src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; @@ -4363,9 +4654,26 @@ next_slot: ins_nr = 0; } + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_trailing_hole(trans, root, inode, path); + if (err) + goto out_unlock; + } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); + if (need_log_inode_item) { + err = log_inode_item(trans, log, dst_path, inode); + if (err) + goto out_unlock; + } if (fast_search) { /* * Some ordered extents started by fsync might have completed @@ -4694,6 +5002,94 @@ next_dir_inode: return ret; } +static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_log_ctx *ctx) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(inode)->root; + const u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->skip_locking = 1; + path->search_commit_root = 1; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + unsigned long ptr; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ + if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + struct btrfs_key inode_key; + struct inode *dir_inode; + + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + inode_key.objectid = btrfs_inode_extref_parent( + leaf, extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + inode_key.objectid = key.offset; + cur_offset = item_size; + } + + dir_inode = btrfs_iget(root->fs_info->sb, &inode_key, + root, NULL); + /* If parent inode was deleted, skip it. */ + if (IS_ERR(dir_inode)) + continue; + + ret = btrfs_log_inode(trans, root, dir_inode, + LOG_INODE_ALL, 0, LLONG_MAX, ctx); + iput(dir_inode); + if (ret) + goto out; + } + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -4713,9 +5109,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; - const struct dentry * const first_parent = parent; - const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > - last_committed); bool log_dentries = false; struct inode *orig_inode = inode; @@ -4776,6 +5169,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) log_dentries = true; + /* + * On unlink we must make sure all our current and old parent directores + * inodes are fully logged. This is to prevent leaving dangling + * directory index entries in directories that were our parents but are + * not anymore. Not doing this results in old parent directory being + * impossible to delete after log replay (rmdir will always fail with + * error -ENOTEMPTY). + * + * Example 1: + * + * mkdir testdir + * touch testdir/foo + * ln testdir/foo testdir/bar + * sync + * unlink testdir/bar + * xfs_io -c fsync testdir/foo + * <power failure> + * mount fs, triggers log replay + * + * If we don't log the parent directory (testdir), after log replay the + * directory still has an entry pointing to the file inode using the bar + * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and + * the file inode has a link count of 1. + * + * Example 2: + * + * mkdir testdir + * touch foo + * ln foo testdir/foo2 + * ln foo testdir/foo3 + * sync + * unlink testdir/foo3 + * xfs_io -c fsync foo + * <power failure> + * mount fs, triggers log replay + * + * Similar as the first example, after log replay the parent directory + * testdir still has an entry pointing to the inode file with name foo3 + * but the file inode does not have a matching BTRFS_INODE_REF_KEY item + * and has a link count of 2. + */ + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + ret = btrfs_log_all_parents(trans, orig_inode, ctx); + if (ret) + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; @@ -4784,23 +5224,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (root != BTRFS_I(inode)->root) break; - /* - * On unlink we must make sure our immediate parent directory - * inode is fully logged. This is to prevent leaving dangling - * directory index entries and a wrong directory inode's i_size. - * Not doing so can result in a directory being impossible to - * delete after log replay (rmdir will always fail with error - * -ENOTEMPTY). - */ - if (did_unlink && parent == first_parent) - inode_only = LOG_INODE_ALL; - else - inode_only = LOG_INODE_EXISTS; - - if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed || - inode_only == LOG_INODE_ALL) { - ret = btrfs_log_inode(trans, root, inode, inode_only, + if (BTRFS_I(inode)->generation > last_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); if (ret) goto end_trans; diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 840a38b2778a..91feb2bdefee 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) return NULL; } +static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) +{ + rb_erase(&node->rb_node, &ulist->root); + list_del(&node->list); + kfree(node); + BUG_ON(ulist->nnodes == 0); + ulist->nnodes--; +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { struct rb_node **p = &ulist->root.rb_node; @@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, node->val = val; node->aux = aux; -#ifdef CONFIG_BTRFS_DEBUG - node->seqnum = ulist->nnodes; -#endif ret = ulist_rbtree_insert(ulist, node); ASSERT(!ret); @@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, return 1; } +/* + * ulist_del - delete one node from ulist + * @ulist: ulist to remove node from + * @val: value to delete + * @aux: aux to delete + * + * The deletion will only be done when *BOTH* val and aux matches. + * Return 0 for successful delete. + * Return > 0 for not found. + */ +int ulist_del(struct ulist *ulist, u64 val, u64 aux) +{ + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + /* Not found */ + if (!node) + return 1; + + if (node->aux != aux) + return 1; + + /* Found and delete */ + ulist_rbtree_erase(ulist, node); + return 0; +} + /** * ulist_next - iterate ulist * @ulist: ulist to iterate @@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) uiter->cur_list = uiter->cur_list->next; } else { uiter->cur_list = ulist->nodes.next; -#ifdef CONFIG_BTRFS_DEBUG - uiter->i = 0; -#endif } node = list_entry(uiter->cur_list, struct ulist_node, list); -#ifdef CONFIG_BTRFS_DEBUG - ASSERT(node->seqnum == uiter->i); - ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); - uiter->i++; -#endif return node; } diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 4c29db604bbe..a01a2c45825f 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); +int ulist_del(struct ulist *ulist, u64 val, u64 aux); /* just like ulist_add_merge() but take a pointer for the aux data */ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 53af23f2c087..76201d6f6ce4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); +struct list_head *btrfs_get_fs_uuids(void) +{ + return &fs_uuids; +} static struct btrfs_fs_devices *__alloc_fs_devices(void) { @@ -441,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } + +void btrfs_free_stale_device(struct btrfs_device *cur_dev) +{ + struct btrfs_fs_devices *fs_devs; + struct btrfs_device *dev; + + if (!cur_dev->name) + return; + + list_for_each_entry(fs_devs, &fs_uuids, list) { + int del = 1; + + if (fs_devs->opened) + continue; + if (fs_devs->seeding) + continue; + + list_for_each_entry(dev, &fs_devs->devices, dev_list) { + + if (dev == cur_dev) + continue; + if (!dev->name) + continue; + + /* + * Todo: This won't be enough. What if the same device + * comes back (with new uuid and) with its mapper path? + * But for now, this does help as mostly an admin will + * either use mapper or non mapper path throughout. + */ + rcu_read_lock(); + del = strcmp(rcu_str_deref(dev->name), + rcu_str_deref(cur_dev->name)); + rcu_read_unlock(); + if (!del) + break; + } + + if (!del) { + /* delete the stale device */ + if (fs_devs->num_devices == 1) { + btrfs_sysfs_remove_fsid(fs_devs); + list_del(&fs_devs->list); + free_fs_devices(fs_devs); + } else { + fs_devs->num_devices--; + list_del(&dev->dev_list); + rcu_string_free(dev->name); + kfree(dev); + } + break; + } + } +} + /* * Add new device to list of registered devices * @@ -556,6 +615,12 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; + /* + * if there is new btrfs on an already registered device, + * then remove the stale device entry. + */ + btrfs_free_stale_device(device); + *fs_devices_ret = fs_devices; return ret; @@ -693,13 +758,13 @@ static void free_device(struct rcu_head *head) static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; + struct btrfs_device *device, *tmp; if (--fs_devices->opened > 0) return 0; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; struct rcu_string *name; @@ -1051,15 +1116,18 @@ out: return ret; } -static int contains_pending_extent(struct btrfs_trans_handle *trans, +static int contains_pending_extent(struct btrfs_transaction *transaction, struct btrfs_device *device, u64 *start, u64 len) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; struct extent_map *em; - struct list_head *search_list = &trans->transaction->pending_chunks; + struct list_head *search_list = &fs_info->pinned_chunks; int ret = 0; u64 physical_start = *start; + if (transaction) + search_list = &transaction->pending_chunks; again: list_for_each_entry(em, search_list, list) { struct map_lookup *map; @@ -1067,19 +1135,35 @@ again: map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { + u64 end; + if (map->stripes[i].dev != device) continue; if (map->stripes[i].physical >= physical_start + len || map->stripes[i].physical + em->orig_block_len <= physical_start) continue; - *start = map->stripes[i].physical + - em->orig_block_len; - ret = 1; + /* + * Make sure that while processing the pinned list we do + * not override our *start with a lower value, because + * we can have pinned chunks that fall within this + * device hole and that have lower physical addresses + * than the pending chunks we processed before. If we + * do not take this special care we can end up getting + * 2 pending chunks that start at the same physical + * device offsets because the end offset of a pinned + * chunk can be equal to the start offset of some + * pending chunk. + */ + end = map->stripes[i].physical + em->orig_block_len; + if (end > *start) { + *start = end; + ret = 1; + } } } - if (search_list == &trans->transaction->pending_chunks) { - search_list = &trans->root->fs_info->pinned_chunks; + if (search_list != &fs_info->pinned_chunks) { + search_list = &fs_info->pinned_chunks; goto again; } @@ -1088,12 +1172,13 @@ again: /* - * find_free_dev_extent - find free space in the specified device - * @device: the device which we search the free space in - * @num_bytes: the size of the free space that we need - * @start: store the start of the free space. - * @len: the size of the free space. that we find, or the size of the max - * free space if we don't find suitable free space + * find_free_dev_extent_start - find free space in the specified device + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @search_start: the position from which to begin the search + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size + * of the max free space if we don't find suitable free space * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number @@ -1107,9 +1192,9 @@ again: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) +int find_free_dev_extent_start(struct btrfs_transaction *transaction, + struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -1119,19 +1204,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, u64 max_hole_start; u64 max_hole_size; u64 extent_end; - u64 search_start; u64 search_end = device->total_bytes; int ret; int slot; struct extent_buffer *l; - /* FIXME use last free of some kind */ - - /* we don't want to overwrite the superblock on the drive, - * so we make sure to start at an offset of at least 1MB - */ - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1192,7 +1269,7 @@ again: * Have to check before we set max_hole_start, otherwise * we could end up sending back this offset anyway. */ - if (contains_pending_extent(trans, device, + if (contains_pending_extent(transaction, device, &search_start, hole_size)) { if (key.offset >= search_start) { @@ -1241,7 +1318,7 @@ next: if (search_end > search_start) { hole_size = search_end - search_start; - if (contains_pending_extent(trans, device, &search_start, + if (contains_pending_extent(transaction, device, &search_start, hole_size)) { btrfs_release_path(path); goto again; @@ -1267,6 +1344,24 @@ out: return ret; } +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) +{ + struct btrfs_root *root = device->dev_root; + u64 search_start; + + /* FIXME use last free of some kind */ + + /* + * we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + return find_free_dev_extent_start(trans->transaction, device, + num_bytes, search_start, start, len); +} + static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) @@ -1706,7 +1801,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1875,6 +1970,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); + + btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + if (tgtdev->bdev) { btrfs_scratch_superblock(tgtdev); fs_info->fs_devices->open_devices--; @@ -2211,7 +2309,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info, device); + btrfs_kobj_add_device(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2252,8 +2350,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) - goto error_trans; + if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + fsid_buf)) + pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2289,7 +2388,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2609,6 +2708,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, return -EINVAL; } map = (struct map_lookup *)em->bdev; + lock_chunks(root->fs_info->chunk_root); + check_system_chunk(trans, extent_root, map->type); + unlock_chunks(root->fs_info->chunk_root); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -2667,9 +2769,7 @@ out: return ret; } -static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_objectid, - u64 chunk_offset) +static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) { struct btrfs_root *extent_root; struct btrfs_trans_handle *trans; @@ -2678,12 +2778,28 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; + /* + * Prevent races with automatic removal of unused block groups. + * After we relocate and before we remove the chunk with offset + * chunk_offset, automatic removal of the block group can kick in, + * resulting in a failure when calling btrfs_remove_chunk() below. + * + * Make sure to acquire this mutex before doing a tree search (dev + * or chunk trees) to find chunks. Otherwise the cleaner kthread might + * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after + * we release the path used to search the chunk/dev tree and before + * the current task acquires this mutex and calls us. + */ + ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex)); + ret = btrfs_can_relocate(extent_root, chunk_offset); if (ret) return -ENOSPC; /* step one, relocate all the extents inside this chunk */ + btrfs_scrub_pause(root); ret = btrfs_relocate_block_group(extent_root, chunk_offset); + btrfs_scrub_continue(root); if (ret) return ret; @@ -2726,13 +2842,18 @@ again: key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto error; + } BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto error; if (ret > 0) @@ -2748,13 +2869,13 @@ again: if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_relocate_chunk(chunk_root, - found_key.objectid, found_key.offset); if (ret == -ENOSPC) failed++; else BUG_ON(ret); } + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (found_key.offset == 0) break; @@ -3211,9 +3332,12 @@ again: goto error; } + mutex_lock(&fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto error; + } /* * this shouldn't happen, it means the last relocate @@ -3225,6 +3349,7 @@ again: ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); ret = 0; break; } @@ -3233,8 +3358,10 @@ again: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != key.objectid) + if (found_key.objectid != key.objectid) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); break; + } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -3247,10 +3374,13 @@ again: ret = should_balance_chunk(chunk_root, leaf, chunk, found_key.offset); btrfs_release_path(path); - if (!ret) + if (!ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; + } if (counting) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); @@ -3258,8 +3388,8 @@ again: } ret = btrfs_relocate_chunk(chunk_root, - found_key.objectid, found_key.offset); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto error; if (ret == -ENOSPC) { @@ -3908,9 +4038,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) uuid_root = btrfs_create_tree(trans, fs_info, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { - btrfs_abort_transaction(trans, tree_root, - PTR_ERR(uuid_root)); - return PTR_ERR(uuid_root); + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, tree_root, ret); + return ret; } fs_info->uuid_root = uuid_root; @@ -3959,12 +4089,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; - u64 chunk_objectid; u64 chunk_offset; int ret; int slot; int failed = 0; bool retried = false; + bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; @@ -3998,11 +4128,16 @@ again: key.type = BTRFS_DEV_EXTENT_KEY; do { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto done; + } ret = btrfs_previous_item(root, path, 0, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto done; if (ret) { @@ -4016,6 +4151,7 @@ again: btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } @@ -4024,15 +4160,16 @@ again: length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } - chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); - ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); + ret = btrfs_relocate_chunk(root, chunk_offset); + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto done; if (ret == -ENOSPC) @@ -4045,15 +4182,6 @@ again: goto again; } else if (failed && retried) { ret = -ENOSPC; - lock_chunks(root); - - btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) - device->fs_devices->total_rw_bytes += diff; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += diff; - spin_unlock(&root->fs_info->free_chunk_lock); - unlock_chunks(root); goto done; } @@ -4065,6 +4193,36 @@ again: } lock_chunks(root); + + /* + * We checked in the above loop all device extents that were already in + * the device tree. However before we have updated the device's + * total_bytes to the new size, we might have had chunk allocations that + * have not complete yet (new block groups attached to transaction + * handles), and therefore their device extents were not yet in the + * device tree and we missed them in the loop above. So if we have any + * pending chunk using a device extent that overlaps the device range + * that we can not use anymore, commit the current transaction and + * repeat the search on the device tree - this way we guarantee we will + * not have chunks using device extents that end beyond 'new_size'. + */ + if (!checked_pending_chunks) { + u64 start = new_size; + u64 len = old_size - new_size; + + if (contains_pending_extent(trans->transaction, device, + &start, len)) { + unlock_chunks(root); + checked_pending_chunks = true; + failed = 0; + retried = false; + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto done; + goto again; + } + } + btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, @@ -4079,6 +4237,16 @@ again: btrfs_end_transaction(trans, root); done: btrfs_free_path(path); + if (ret) { + lock_chunks(root); + btrfs_device_set_total_bytes(device, old_size); + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); + unlock_chunks(root); + } return ret; } @@ -4914,9 +5082,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) * and the stripes */ sizeof(u64) * (total_stripes), - GFP_NOFS); - if (!bbio) - return NULL; + GFP_NOFS|__GFP_NOFAIL); atomic_set(&bbio->error, 0); atomic_set(&bbio->refs, 1); @@ -5584,26 +5750,26 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) +static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) { bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio_endio(bio, err); + bio_endio(bio); btrfs_put_bbio(bbio); } -static void btrfs_end_bio(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio) { struct btrfs_bio *bbio = bio->bi_private; - struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; - if (err) { + if (bio->bi_error) { atomic_inc(&bbio->error); - if (err == -EIO || err == -EREMOTEIO) { + if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; + struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; @@ -5638,17 +5804,16 @@ static void btrfs_end_bio(struct bio *bio, int err) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - err = -EIO; + bio->bi_error = -EIO; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - set_bit(BIO_UPTODATE, &bio->bi_flags); - err = 0; + bio->bi_error = 0; } - btrfs_end_bbio(bbio, bio, err); + btrfs_end_bbio(bbio, bio); } else if (!is_orig_bio) { bio_put(bio); } @@ -5669,7 +5834,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_pending_bios *pending_bios; if (device->missing || !device->bdev) { - bio_endio(bio, -EIO); + bio_io_error(bio); return; } @@ -5714,34 +5879,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, &device->work); } -static int bio_size_ok(struct block_device *bdev, struct bio *bio, - sector_t sector) -{ - struct bio_vec *prev; - struct request_queue *q = bdev_get_queue(bdev); - unsigned int max_sectors = queue_max_sectors(q); - struct bvec_merge_data bvm = { - .bi_bdev = bdev, - .bi_sector = sector, - .bi_rw = bio->bi_rw, - }; - - if (WARN_ON(bio->bi_vcnt == 0)) - return 1; - - prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bio_sectors(bio) > max_sectors) - return 0; - - if (!q->merge_bvec_fn) - return 1; - - bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) - return 0; - return 1; -} - static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct bio *bio, u64 physical, int dev_nr, int rw, int async) @@ -5775,38 +5912,6 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfsic_submit_bio(rw, bio); } -static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, - struct bio *first_bio, struct btrfs_device *dev, - int dev_nr, int rw, int async) -{ - struct bio_vec *bvec = first_bio->bi_io_vec; - struct bio *bio; - int nr_vecs = bio_get_nr_vecs(dev->bdev); - u64 physical = bbio->stripes[dev_nr].physical; - -again: - bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); - if (!bio) - return -ENOMEM; - - while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { - if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len) { - u64 len = bio->bi_iter.bi_size; - - atomic_inc(&bbio->stripes_pending); - submit_stripe_bio(root, bbio, bio, physical, dev_nr, - rw, async); - physical += len; - goto again; - } - bvec++; - } - - submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); - return 0; -} - static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); @@ -5816,8 +5921,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - - btrfs_end_bbio(bbio, bio, -EIO); + bio->bi_error = -EIO; + btrfs_end_bbio(bbio, bio); } } @@ -5879,18 +5984,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, continue; } - /* - * Check and see if we're ok with this bio based on it's size - * and offset with the given device. - */ - if (!bio_size_ok(dev->bdev, first_bio, - bbio->stripes[dev_nr].physical >> 9)) { - ret = breakup_stripe_bio(root, bbio, first_bio, dev, - dev_nr, rw, async_submit); - BUG_ON(ret); - continue; - } - if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ @@ -6072,6 +6165,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); return -EIO; } + btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing", + devid, uuid); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6191,10 +6286,11 @@ static int read_one_dev(struct btrfs_root *root, if (!btrfs_test_opt(root, DEGRADED)) return -EIO; - btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, fs_devices, devid, dev_uuid); if (!device) return -ENOMEM; + btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", + devid, dev_uuid); } else { if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -6722,3 +6818,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, } unlock_chunks(root); } + +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = fs_info; + fs_devices = fs_devices->seed; + } +} + +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = NULL; + fs_devices = fs_devices->seed; + } +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index cedae0356558..2ca784a14e84 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -253,6 +253,12 @@ struct btrfs_fs_devices { * nonrot flag set */ int rotating; + + struct btrfs_fs_info *fs_info; + /* sysfs kobjects */ + struct kobject super_kobj; + struct kobject *device_dir_kobj; + struct completion kobj_unregister; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -447,6 +453,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent_start(struct btrfs_transaction *transaction, + struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *max_avail); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); @@ -535,5 +544,8 @@ static inline void unlock_chunks(struct btrfs_root *root) mutex_unlock(&root->fs_info->chunk_mutex); } +struct list_head *btrfs_get_fs_uuids(void); +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/buffer.c b/fs/buffer.c index 1cf7a53a0277..82283abb2795 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2957,14 +2957,14 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, } EXPORT_SYMBOL(generic_block_bmap); -static void end_bio_bh_io_sync(struct bio *bio, int err) +static void end_bio_bh_io_sync(struct bio *bio) { struct buffer_head *bh = bio->bi_private; - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) + if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); - bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); + bh->b_end_io(bh, !bio->bi_error); bio_put(bio); } @@ -3046,12 +3046,9 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_io_vec[0].bv_page = bh->b_page; - bio->bi_io_vec[0].bv_len = bh->b_size; - bio->bi_io_vec[0].bv_offset = bh_offset(bh); - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = bh->b_size; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + BUG_ON(bio->bi_iter.bi_size != bh->b_size); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 8c52472d2efa..aecd0859eacb 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -43,7 +43,6 @@ struct cachefiles_object { loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ -#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ atomic_t usage; /* object usage count */ uint8_t type; /* object type */ uint8_t new; /* T if object new */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ab857ab9f40d..fc1056f5c96a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -97,7 +97,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, * call vfs_unlink(), vfs_rmdir() or vfs_rename() */ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, - struct dentry *dentry) + struct dentry *dentry, + enum fscache_why_object_killed why) { struct cachefiles_object *object; struct rb_node *p; @@ -132,8 +133,9 @@ found_dentry: pr_err("\n"); pr_err("Error: Can't preemptively bury live object\n"); cachefiles_printk_object(object, NULL); - } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - pr_err("Error: Object already preemptively buried\n"); + } else { + if (why != FSCACHE_OBJECT_IS_STALE) + fscache_object_mark_killed(&object->fscache, why); } write_unlock(&cache->active_lock); @@ -265,7 +267,8 @@ requeue: static int cachefiles_bury_object(struct cachefiles_cache *cache, struct dentry *dir, struct dentry *rep, - bool preemptive) + bool preemptive, + enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; @@ -289,7 +292,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, ret = vfs_unlink(d_inode(dir), rep, NULL); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } mutex_unlock(&d_inode(dir)->i_mutex); @@ -394,7 +397,7 @@ try_again: "Rename failed with error %d", ret); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } unlock_rename(cache->graveyard, dir); @@ -422,7 +425,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); - if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); @@ -433,7 +436,8 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, * may have been renamed */ if (dir == object->dentry->d_parent) { ret = cachefiles_bury_object(cache, dir, - object->dentry, false); + object->dentry, false, + FSCACHE_OBJECT_WAS_RETIRED); } else { /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore @@ -522,7 +526,7 @@ lookup_again: if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mkdir(&path, next, 0); @@ -551,7 +555,7 @@ lookup_again: if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mknod(&path, next, S_IFREG, 0); @@ -602,7 +606,8 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next, true); + ret = cachefiles_bury_object(cache, dir, next, true, + FSCACHE_OBJECT_IS_STALE); dput(next); next = NULL; @@ -610,6 +615,7 @@ lookup_again: goto delete_error; _debug("redo lookup"); + fscache_object_retrying_stale(&object->fscache); goto lookup_again; } } @@ -662,6 +668,8 @@ lookup_again: _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); return 0; +no_space_error: + fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE); create_error: _debug("create error %d", ret); if (ret == -EIO) @@ -927,7 +935,8 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim, false); + ret = cachefiles_bury_object(cache, dir, victim, false, + FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 64fa248343f6..8f84646f10e9 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, val_size2 = posix_acl_xattr_size(default_acl->a_count); err = -ENOMEM; - tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); + tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); if (!tmp_buf) goto out_err; - pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); + pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL); if (!pagelist) goto out_err; ceph_pagelist_init(pagelist); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e162bcd105ee..a268abfe60ac 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page) inode = mapping->host; ci = ceph_inode(inode); - /* - * Note that we're grabbing a snapc ref here without holding - * any locks! - */ - snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); - /* dirty the head */ spin_lock(&ci->i_ceph_lock); - if (ci->i_head_snapc == NULL) - ci->i_head_snapc = ceph_get_snap_context(snapc); - ++ci->i_wrbuffer_ref_head; + BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + capsnap->dirty_pages++; + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + ++ci->i_wrbuffer_ref_head; + } if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; @@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) /* build page vector */ nr_pages = calc_pages_for(0, len); - pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); ret = -ENOMEM; if (!pages) goto out; @@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) dout("start_read %p adding %p idx %lu\n", inode, page, page->index); if (add_to_page_cache_lru(page, &inode->i_data, page->index, - GFP_NOFS)) { + GFP_KERNEL)) { ceph_fscache_uncache_page(inode, page); page_cache_release(page); dout("start_read %p add_to_page_cache failed %p\n", @@ -436,7 +440,7 @@ out: * only snap context we are allowed to write back. */ static struct ceph_snap_context *get_oldest_context(struct inode *inode, - u64 *snap_size) + loff_t *snap_size) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_osd_client *osdc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); + loff_t snap_size = -1; long writeback_stat; - u64 truncate_size, snap_size = 0; + u64 truncate_size; u32 truncate_seq; int err = 0, len = PAGE_CACHE_SIZE; @@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) spin_lock(&ci->i_ceph_lock); truncate_seq = ci->i_truncate_seq; truncate_size = ci->i_truncate_size; - if (!snap_size) + if (snap_size == -1) snap_size = i_size_read(inode); spin_unlock(&ci->i_ceph_lock); @@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync = 0; - u64 truncate_size, snap_size; + loff_t snap_size, i_size; + u64 truncate_size; u32 truncate_seq; /* @@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); - snap_size = 0; + snap_size = -1; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -749,16 +755,13 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } - if (snap_size == 0) - snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); spin_lock(&ci->i_ceph_lock); truncate_seq = ci->i_truncate_seq; truncate_size = ci->i_truncate_size; - if (!snap_size) - snap_size = i_size_read(inode); + i_size = i_size_read(inode); spin_unlock(&ci->i_ceph_lock); if (last_snapc && snapc != last_snapc) { @@ -828,8 +831,10 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if (page_offset(page) >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); + if (page_offset(page) >= + (snap_size == -1 ? i_size : snap_size)) { + dout("%p page eof %llu\n", page, + (snap_size == -1 ? i_size : snap_size)); done = 1; unlock_page(page); break; @@ -884,7 +889,8 @@ get_more_pages: } if (do_sync) - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, + CEPH_OSD_OP_STARTSYNC, 0); req->r_callback = writepages_finish; req->r_inode = inode; @@ -944,10 +950,18 @@ get_more_pages: } /* Format the osd request message and submit the write */ - offset = page_offset(pages[0]); - len = min(snap_size - offset, - (u64)locked_pages << PAGE_CACHE_SHIFT); + len = (u64)locked_pages << PAGE_CACHE_SHIFT; + if (snap_size == -1) { + len = min(len, (u64)i_size_read(inode) - offset); + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + len = max(len, 1 + + ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); + } else { + len = min(len, snap_size - offset); + } dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); @@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file, { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; @@ -1044,10 +1057,6 @@ retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); - /* check snap context */ - BUG_ON(!ci->i_snap_realm); - down_read(&mdsc->snap_rwsem); - BUG_ON(!ci->i_snap_realm->cached_context); snapc = page_snap_context(page); if (snapc && snapc != ci->i_head_snapc) { /* @@ -1055,7 +1064,6 @@ retry_locked: * context! is it writeable now? */ oldest = get_oldest_context(inode, NULL); - up_read(&mdsc->snap_rwsem); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); @@ -1112,7 +1120,6 @@ retry_locked: } /* we need to read it. */ - up_read(&mdsc->snap_rwsem); r = readpage_nounlock(file, page); if (r < 0) goto fail_nosnap; @@ -1157,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, /* * we don't do anything in here that simple_write_end doesn't do - * except adjust dirty page accounting and drop read lock on - * mdsc->snap_rwsem. + * except adjust dirty page accounting */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; @@ -1188,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, set_page_dirty(page); unlock_page(page); - up_read(&mdsc->snap_rwsem); page_cache_release(page); if (check_cap) @@ -1314,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_cap_flush *prealloc_cf; struct page *page = vmf->page; loff_t off = page_offset(page); loff_t size = i_size_read(inode); size_t len; int want, got, ret; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return VM_FAULT_SIGBUS; + if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; if (off == 0) { @@ -1330,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) - return VM_FAULT_SIGBUS; + if (ret < 0) { + ret = VM_FAULT_SIGBUS; + goto out_free; + } } if (off + PAGE_CACHE_SIZE <= size) @@ -1353,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) break; if (ret != -ERESTARTSYS) { WARN_ON(1); - return VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; + goto out_free; } } dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", @@ -1373,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret == 0) { /* success. we'll keep the page locked. */ set_page_dirty(page); - up_read(&mdsc->snap_rwsem); ret = VM_FAULT_LOCKED; } else { if (ret == -ENOMEM) @@ -1389,7 +1398,8 @@ out: int dirty; spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1398,6 +1408,8 @@ out: dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); +out_free: + ceph_free_cap_flush(prealloc_cf); return ret; } @@ -1509,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ci->i_snap_realm->cached_context, - 0, 0, false); + ceph_empty_snapc, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -1528,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 1, 3, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ci->i_snap_realm->cached_context, + ceph_empty_snapc, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -1582,7 +1593,7 @@ out: return err; } -static struct vm_operations_struct ceph_vmops = { +static const struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, }; @@ -1597,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &ceph_vmops; return 0; } + +enum { + POOL_READ = 1, + POOL_WRITE = 2, +}; + +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; + struct rb_node **p, *parent; + struct ceph_pool_perm *perm; + struct page **pages; + int err = 0, err2 = 0, have = 0; + + down_read(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; + while (*p) { + perm = rb_entry(*p, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + up_read(&mdsc->pool_perm_rwsem); + if (*p) + goto out; + + dout("__ceph_pool_perm_get pool %u no perm cached\n", pool); + + down_write(&mdsc->pool_perm_rwsem); + parent = NULL; + while (*p) { + parent = *p; + perm = rb_entry(parent, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + if (*p) { + up_write(&mdsc->pool_perm_rwsem); + goto out; + } + + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, + ceph_empty_snapc, + 1, false, GFP_NOFS); + if (!rd_req) { + err = -ENOMEM; + goto out_unlock; + } + + rd_req->r_flags = CEPH_OSD_FLAG_READ; + osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); + rd_req->r_base_oloc.pool = pool; + snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), + "%llx.00000000", ci->i_vino.ino); + rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, + ceph_empty_snapc, + 1, false, GFP_NOFS); + if (!wr_req) { + err = -ENOMEM; + goto out_unlock; + } + + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); + wr_req->r_base_oloc.pool = pool; + wr_req->r_base_oid = rd_req->r_base_oid; + + /* one page should be large enough for STAT data */ + pages = ceph_alloc_page_vector(1, GFP_KERNEL); + if (IS_ERR(pages)) { + err = PTR_ERR(pages); + goto out_unlock; + } + + osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, + 0, false, true); + ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, + &ci->vfs_inode.i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); + + ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, + &ci->vfs_inode.i_mtime); + err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); + + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + if (!err2) + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + + if (err >= 0 || err == -ENOENT) + have |= POOL_READ; + else if (err != -EPERM) + goto out_unlock; + + if (err2 == 0 || err2 == -EEXIST) + have |= POOL_WRITE; + else if (err2 != -EPERM) { + err = err2; + goto out_unlock; + } + + perm = kmalloc(sizeof(*perm), GFP_NOFS); + if (!perm) { + err = -ENOMEM; + goto out_unlock; + } + + perm->pool = pool; + perm->perm = have; + rb_link_node(&perm->node, parent, p); + rb_insert_color(&perm->node, &mdsc->pool_perm_tree); + err = 0; +out_unlock: + up_write(&mdsc->pool_perm_rwsem); + + if (rd_req) + ceph_osdc_put_request(rd_req); + if (wr_req) + ceph_osdc_put_request(wr_req); +out: + if (!err) + err = have; + dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err); + return err; +} + +int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +{ + u32 pool; + int ret, flags; + + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + NOPOOLPERM)) + return 0; + + spin_lock(&ci->i_ceph_lock); + flags = ci->i_ceph_flags; + pool = ceph_file_layout_pg_pool(ci->i_layout); + spin_unlock(&ci->i_ceph_lock); +check: + if (flags & CEPH_I_POOL_PERM) { + if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { + dout("ceph_pool_perm_check pool %u no read perm\n", + pool); + return -EPERM; + } + if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { + dout("ceph_pool_perm_check pool %u no write perm\n", + pool); + return -EPERM; + } + return 0; + } + + ret = __ceph_pool_perm_get(ci, pool); + if (ret < 0) + return ret; + + flags = CEPH_I_POOL_PERM; + if (ret & POOL_READ) + flags |= CEPH_I_POOL_RD; + if (ret & POOL_WRITE) + flags |= CEPH_I_POOL_WR; + + spin_lock(&ci->i_ceph_lock); + if (pool == ceph_file_layout_pg_pool(ci->i_layout)) { + ci->i_ceph_flags = flags; + } else { + pool = ceph_file_layout_pg_pool(ci->i_layout); + flags = ci->i_ceph_flags; + } + spin_unlock(&ci->i_ceph_lock); + goto check; +} + +void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) +{ + struct ceph_pool_perm *perm; + struct rb_node *n; + + while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { + n = rb_first(&mdsc->pool_perm_tree); + perm = rb_entry(n, struct ceph_pool_perm, node); + rb_erase(n, &mdsc->pool_perm_tree); + kfree(perm); + } +} diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index be5ea6af8366..ddd5e9471290 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + if (ci->i_rdcache_ref || + (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ + ci->vfs_inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) /* remove from session list */ spin_lock(&session->s_cap_lock); - /* - * s_cap_reconnect is protected by s_cap_lock. no one changes - * s_cap_gen while session is in the reconnect state. - */ - if (queue_release && - (!session->s_cap_reconnect || - cap->cap_gen == session->s_cap_gen)) - __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, - cap->mseq, cap->issue_seq); - if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; + + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { + cap->queue_release = 1; + if (removed) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + removed = 0; + } + } else { + cap->queue_release = 0; + } + cap->cap_ino = ci->i_vino.ino; + spin_unlock(&session->s_cap_lock); /* remove from inode list */ @@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) static int send_cap_msg(struct ceph_mds_session *session, u64 ino, u64 cid, int op, int caps, int wanted, int dirty, - u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, - u64 size, u64 max_size, + u32 seq, u64 flush_tid, u64 oldest_flush_tid, + u32 issue_seq, u32 mseq, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, kuid_t uid, kgid_t gid, umode_t mode, @@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session, size_t extra_len; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" - " seq %u/%u mseq %u follows %lld size %llu/%llu" + " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), ceph_cap_string(dirty), - seq, issue_seq, mseq, follows, size, max_size, + seq, issue_seq, flush_tid, oldest_flush_tid, + mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - /* flock buffer size + inline version + inline data size */ - extra_len = 4 + 8 + 4; + /* flock buffer size + inline version + inline data size + + * osd_epoch_barrier + oldest_flush_tid */ + extra_len = 4 + 8 + 4 + 4 + 8; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, GFP_NOFS, false); if (!msg) return -ENOMEM; + msg->hdr.version = cpu_to_le16(6); msg->hdr.tid = cpu_to_le64(flush_tid); fc = msg->front.iov_base; @@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); + /* osd_epoch_barrier */ + ceph_encode_32(&p, 0); + /* oldest_flush_tid */ + ceph_encode_64(&p, oldest_flush_tid); fc->xattr_version = cpu_to_le64(xattr_version); if (xattrs_buf) { @@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - struct ceph_mds_cap_item *item; - - BUG_ON(!session->s_num_cap_releases); - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - - dout(" adding %llx release to mds%d msg %p (%d left)\n", - ino, session->s_mds, msg, session->s_num_cap_releases); - - BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); - head = msg->front.iov_base; - le32_add_cpu(&head->num, 1); - item = msg->front.iov_base + msg->front.iov_len; - item->ino = cpu_to_le64(ino); - item->cap_id = cpu_to_le64(cap_id); - item->migrate_seq = cpu_to_le32(migrate_seq); - item->seq = cpu_to_le32(issue_seq); - - session->s_num_cap_releases--; - - msg->front.iov_len += sizeof(*item); - if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { - dout(" release msg %p full\n", msg); - list_move_tail(&msg->list_head, &session->s_cap_releases_done); - } else { - dout(" release msg %p at %d/%d (%d)\n", msg, - (int)le32_to_cpu(head->num), - (int)CEPH_CAPS_PER_RELEASE, - (int)msg->front.iov_len); - } -} - /* * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_ceph_lock. @@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode) */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, - unsigned *pflush_tid) + u64 flush_tid, u64 oldest_flush_tid) __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; @@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; int delayed = 0; - u64 flush_tid = 0; - int i; int ret; bool inline_data; @@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; - if (flushing) { - /* - * assign a tid for flush operations so we can avoid - * flush1 -> dirty1 -> flush2 -> flushack1 -> mark - * clean type races. track latest tid for every bit - * so we can handle flush AxFw, flush Fw, and have the - * first ack clean Ax. - */ - flush_tid = ++ci->i_cap_flush_last_tid; - if (pflush_tid) - *pflush_tid = flush_tid; - dout(" cap_flush_tid %d\n", (int)flush_tid); - for (i = 0; i < CEPH_CAP_BITS; i++) - if (flushing & (1 << i)) - ci->i_cap_flush_tid[i] = flush_tid; - - follows = ci->i_head_snapc->seq; - } else { - follows = 0; - } + follows = flushing ? ci->i_head_snapc->seq : 0; keep = cap->implemented; seq = cap->seq; @@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, - op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + op, keep, want, flushing, seq, + flush_tid, oldest_flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, uid, gid, mode, xattr_version, xattr_blob, follows, inline_data); @@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Unless @again is true, skip cap_snaps that were already sent to + * Unless @kick is true, skip cap_snaps that were already sent to * the MDS (i.e., during this session). * * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession, - int again) + int kick) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { @@ -1297,11 +1257,8 @@ retry: if (capsnap->dirty_pages || capsnap->writing) break; - /* - * if cap writeback already occurred, we should have dropped - * the capsnap in ceph_put_wrbuffer_cap_refs. - */ - BUG_ON(capsnap->dirty == 0); + /* should be removed by ceph_try_drop_cap_snap() */ + BUG_ON(!capsnap->need_flush); /* pick mds, take s_mutex */ if (ci->i_auth_cap == NULL) { @@ -1310,7 +1267,7 @@ retry: } /* only flush each capsnap once */ - if (!again && !list_empty(&capsnap->flushing_item)) { + if (!kick && !list_empty(&capsnap->flushing_item)) { dout("already flushed %p, skipping\n", capsnap); continue; } @@ -1320,6 +1277,9 @@ retry: if (session && session->s_mds != mds) { dout("oops, wrong session %p mutex\n", session); + if (kick) + goto out; + mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); session = NULL; @@ -1343,20 +1303,22 @@ retry: goto retry; } - capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + spin_lock(&mdsc->cap_dirty_lock); + capsnap->flush_tid = ++mdsc->last_cap_flush_tid; + spin_unlock(&mdsc->cap_dirty_lock); + atomic_inc(&capsnap->nref); - if (!list_empty(&capsnap->flushing_item)) - list_del_init(&capsnap->flushing_item); - list_add_tail(&capsnap->flushing_item, - &session->s_cap_snaps_flushing); + if (list_empty(&capsnap->flushing_item)) + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); spin_unlock(&ci->i_ceph_lock); dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", inode, capsnap, capsnap->follows, capsnap->flush_tid); send_cap_msg(session, ceph_vino(inode).ino, 0, CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, - capsnap->size, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, + 0, mseq, capsnap->size, 0, &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, @@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. */ -int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; @@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - if (!ci->i_head_snapc) + WARN_ON_ONCE(ci->i_prealloc_cap_flush); + swap(ci->i_prealloc_cap_flush, *pcf); + + if (!ci->i_head_snapc) { + WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); + } dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); @@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ihold(inode); dirty |= I_DIRTY_SYNC; } + } else { + WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && @@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) return dirty; } +static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &ci->i_cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, i_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->i_node, parent, p); + rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); +} + +static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &mdsc->cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, g_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->g_node, parent, p); + rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); +} + +struct ceph_cap_flush *ceph_alloc_cap_flush(void) +{ + return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); +} + +void ceph_free_cap_flush(struct ceph_cap_flush *cf) +{ + if (cf) + kmem_cache_free(ceph_cap_flush_cachep, cf); +} + +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) +{ + struct rb_node *n = rb_first(&mdsc->cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, g_node); + return cf->tid; + } + return 0; +} + /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. @@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + u64 *flush_tid, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *cf = NULL; int flushing; BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); + BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", @@ -1463,22 +1504,30 @@ static int __mark_caps_flushing(struct inode *inode, ci->i_dirty_caps = 0; dout(" inode %p now !dirty\n", inode); + swap(cf, ci->i_prealloc_cap_flush); + cf->caps = flushing; + spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); + cf->tid = ++mdsc->last_cap_flush_tid; + __add_cap_flushing_to_mdsc(mdsc, cf); + *oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; - dout(" inode %p now flushing seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing tid %llu\n", inode, cf->tid); } else { list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); - dout(" inode %p now flushing (more) seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing (more) tid %llu\n", + inode, cf->tid); } spin_unlock(&mdsc->cap_dirty_lock); + __add_cap_flushing_to_inode(ci, cf); + + *flush_tid = cf->tid; return flushing; } @@ -1524,6 +1573,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; + u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; @@ -1553,13 +1603,13 @@ retry: retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); - want = file_wanted | used; issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; - retain = want | CEPH_CAP_PIN; + want = file_wanted; + retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { - if (want) { + if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ } else if (S_ISDIR(inode->i_mode) && (issued & CEPH_CAP_FILE_SHARED) && @@ -1602,9 +1652,10 @@ retry_locked: * If we fail, it's because pages are locked.... try again later. */ if ((!is_delayed || mdsc->stopping) && - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - inode->i_data.nrpages && /* have cached pages */ - (file_wanted == 0 || /* no open files */ + !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ + ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + inode->i_data.nrpages && /* have cached pages */ + (file_wanted == 0 || /* no open files */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ !tried_invalidate) { @@ -1742,17 +1793,25 @@ ack: took_snap_rwsem = 1; } - if (cap == ci->i_auth_cap && ci->i_dirty_caps) - flushing = __mark_caps_flushing(inode, session); - else + if (cap == ci->i_auth_cap && ci->i_dirty_caps) { + flushing = __mark_caps_flushing(inode, session, + &flush_tid, + &oldest_flush_tid); + } else { flushing = 0; + flush_tid = 0; + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + } mds = cap->mds; /* remember mds, so we don't repeat */ sent++; /* __send_cap drops i_ceph_lock */ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, - want, retain, flushing, NULL); + want, retain, flushing, + flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -1781,12 +1840,13 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int flushing = 0; struct ceph_mds_session *session = NULL; + int flushing = 0; + u64 flush_tid = 0, oldest_flush_tid = 0; retry: spin_lock(&ci->i_ceph_lock); @@ -1811,42 +1871,54 @@ retry: if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; - flushing = __mark_caps_flushing(inode, session); + flushing = __mark_caps_flushing(inode, session, &flush_tid, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, - cap->issued | cap->implemented, flushing, - flush_tid); - if (!delayed) - goto out_unlocked; + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + } else { + struct rb_node *n = rb_last(&ci->i_cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, i_node); + flush_tid = cf->tid; + } + flushing = ci->i_flushing_caps; + spin_unlock(&ci->i_ceph_lock); } out: - spin_unlock(&ci->i_ceph_lock); -out_unlocked: if (session) mutex_unlock(&session->s_mutex); + + *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. */ -static int caps_are_flushed(struct inode *inode, unsigned tid) +static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int i, ret = 1; + struct ceph_cap_flush *cf; + struct rb_node *n; + int ret = 1; spin_lock(&ci->i_ceph_lock); - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((ci->i_flushing_caps & (1 << i)) && - ci->i_cap_flush_tid[i] <= tid) { - /* still flushing this bit */ + n = rb_first(&ci->i_cap_flush_tree); + if (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid <= flush_tid) ret = 0; - break; - } + } spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1864,13 +1936,16 @@ static void sync_write_wait(struct inode *inode) struct ceph_osd_request *req; u64 last_tid; + if (!S_ISREG(inode->i_mode)) + return; + spin_lock(&ci->i_unsafe_lock); if (list_empty(head)) goto out; /* set upper bound as _last_ entry in chain */ - req = list_entry(head->prev, struct ceph_osd_request, - r_unsafe_item); + req = list_last_entry(head, struct ceph_osd_request, + r_unsafe_item); last_tid = req->r_tid; do { @@ -1888,18 +1963,64 @@ static void sync_write_wait(struct inode *inode) */ if (list_empty(head)) break; - req = list_entry(head->next, struct ceph_osd_request, - r_unsafe_item); + req = list_first_entry(head, struct ceph_osd_request, + r_unsafe_item); } while (req->r_tid < last_tid); out: spin_unlock(&ci->i_unsafe_lock); } +/* + * wait for any uncommitted directory operations to commit. + */ +static int unsafe_dirop_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_dirops; + struct ceph_mds_request *req; + u64 last_tid; + int ret = 0; + + if (!S_ISDIR(inode->i_mode)) + return 0; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + req = list_last_entry(head, struct ceph_mds_request, + r_unsafe_dir_item); + last_tid = req->r_tid; + + do { + ceph_mdsc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + + dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n", + inode, req->r_tid, last_tid); + ret = !wait_for_completion_timeout(&req->r_safe_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (ret) + ret = -EIO; /* timed out */ + + ceph_mdsc_put_request(req); + + spin_lock(&ci->i_unsafe_lock); + if (ret || list_empty(head)) + break; + req = list_first_entry(head, struct ceph_mds_request, + r_unsafe_dir_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); + return ret; +} + int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int ret; int dirty; @@ -1908,25 +2029,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) - return ret; + goto out; + + if (datasync) + goto out; + mutex_lock(&inode->i_mutex); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + ret = unsafe_dirop_wait(inode); + /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - dout("fsync waiting for flush_tid %u\n", flush_tid); + if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { ret = wait_event_interruptible(ci->i_cap_wq, - caps_are_flushed(inode, flush_tid)); + caps_are_flushed(inode, flush_tid)); } - - dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); mutex_unlock(&inode->i_mutex); +out: + dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; } @@ -1939,7 +2065,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int err = 0; int dirty; int wait = wbc->sync_mode == WB_SYNC_ALL; @@ -1994,6 +2120,93 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, } } +static int __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct ceph_cap_flush *cf; + struct rb_node *n; + int delayed = 0; + u64 first_tid = 0; + u64 oldest_flush_tid; + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + + while (true) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + break; + } + + for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid >= first_tid) + break; + } + if (!n) { + spin_unlock(&ci->i_ceph_lock); + break; + } + + cf = rb_entry(n, struct ceph_cap_flush, i_node); + + first_tid = cf->tid + 1; + + dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, + cap, cf->tid, ceph_cap_string(cf->caps)); + delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + cf->caps, cf->tid, oldest_flush_tid); + } + return delayed; +} + +void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + struct ceph_cap *cap; + + dout("early_kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + continue; + } + + + /* + * if flushing caps were revoked, we re-send the cap flush + * in client reconnect stage. This guarantees MDS * processes + * the cap flush message before issuing the flushing caps to + * other client. + */ + if ((cap->issued & ci->i_flushing_caps) != + ci->i_flushing_caps) { + spin_unlock(&ci->i_ceph_lock); + if (!__kick_flushing_caps(mdsc, session, ci)) + continue; + spin_lock(&ci->i_ceph_lock); + } + + spin_unlock(&ci->i_ceph_lock); + } +} + void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { @@ -2003,28 +2216,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, dout("kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap *cap; - int delayed = 0; - - spin_lock(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - if (cap && cap->session == session) { - dout("kick_flushing_caps %p cap %p %s\n", inode, - cap, ceph_cap_string(ci->i_flushing_caps)); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); - spin_unlock(&ci->i_ceph_lock); - } - } else { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); + int delayed = __kick_flushing_caps(mdsc, session, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } } @@ -2036,26 +2231,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; - int delayed = 0; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; - dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, - ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + dout("kick_flushing_inode_caps %p flushing %s\n", inode, + ceph_cap_string(ci->i_flushing_caps)); __ceph_flush_snaps(ci, &session, 1); if (ci->i_flushing_caps) { + int delayed; + spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); + spin_unlock(&ci->i_ceph_lock); + + delayed = __kick_flushing_caps(mdsc, session, ci); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); @@ -2073,7 +2267,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, * * Protected by i_ceph_lock. */ -static void __take_cap_refs(struct ceph_inode_info *ci, int got) +static void __take_cap_refs(struct ceph_inode_info *ci, int got, + bool snap_rwsem_locked) { if (got & CEPH_CAP_PIN) ci->i_pin_ref++; @@ -2081,8 +2276,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; - if (got & CEPH_CAP_FILE_WR) + if (got & CEPH_CAP_FILE_WR) { + if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { + BUG_ON(!snap_rwsem_locked); + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + } ci->i_wr_ref++; + } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(&ci->vfs_inode); @@ -2100,16 +2301,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, int *got, int *check_max, int *err) + loff_t endoff, bool nonblock, int *got, int *err) { struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; int file_wanted; + bool snap_rwsem_locked = false; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); +again: spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2125,6 +2329,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) { + up_read(&mdsc->snap_rwsem); + snap_rwsem_locked = false; + } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } @@ -2136,7 +2344,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) { - *check_max = 1; + *err = -EAGAIN; ret = 1; } goto out_unlock; @@ -2164,8 +2372,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { + if (!snap_rwsem_locked && + !ci->i_head_snapc && + (need & CEPH_CAP_FILE_WR)) { + if (!down_read_trylock(&mdsc->snap_rwsem)) { + /* + * we can not call down_read() when + * task isn't in TASK_RUNNING state + */ + if (nonblock) { + *err = -EAGAIN; + ret = 1; + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + snap_rwsem_locked = true; + goto again; + } + snap_rwsem_locked = true; + } *got = need | (have & want); - __take_cap_refs(ci, *got); + __take_cap_refs(ci, *got, true); ret = 1; } } else { @@ -2189,6 +2418,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, } out_unlock: spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) + up_read(&mdsc->snap_rwsem); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); @@ -2231,50 +2462,70 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, check_max, ret, err = 0; + int _got, ret, err = 0; -retry: - if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); - _got = 0; - check_max = 0; - ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - &_got, &check_max, &err)); - if (err) - ret = err; + ret = ceph_pool_perm_check(ci, need); if (ret < 0) return ret; - if (check_max) - goto retry; + while (true) { + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); - if (ci->i_inline_version != CEPH_INLINE_NONE && - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { - struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - goto out; - } - page_cache_release(page); - } - /* - * drop cap refs first because getattr while holding - * caps refs can cause deadlock. - */ - ceph_put_cap_refs(ci, _got); + err = 0; _got = 0; + ret = try_get_cap_refs(ci, need, want, endoff, + false, &_got, &err); + if (ret) { + if (err == -EAGAIN) + continue; + if (err < 0) + return err; + } else { + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, endoff, + true, &_got, &err)); + if (err == -EAGAIN) + continue; + if (err < 0) + ret = err; + if (ret < 0) + return ret; + } - /* getattr request will bring inline data into page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, - CEPH_STAT_CAP_INLINE_DATA, true); - if (ret < 0) - return ret; - goto retry; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = + find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + break; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while + * holding * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* + * getattr request will bring inline data into + * page cache + */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, + true); + if (ret < 0) + return ret; + continue; + } + break; } -out: + *got = _got; return 0; } @@ -2286,10 +2537,31 @@ out: void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); - __take_cap_refs(ci, caps); + __take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } + +/* + * drop cap_snap that is not associated with any snapshot. + * we don't need to send FLUSHSNAP message for it. + */ +static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (!capsnap->need_flush && + !capsnap->writing && !capsnap->dirty_pages) { + + dout("dropping cap_snap %p follows %llu\n", + capsnap, capsnap->follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + return 1; + } + return 0; +} + /* * Release cap refs. * @@ -2303,7 +2575,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; - struct ceph_cap_snap *capsnap; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) @@ -2325,17 +2596,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (had & CEPH_CAP_FILE_WR) if (--ci->i_wr_ref == 0) { last++; - if (!list_empty(&ci->i_cap_snaps)) { - capsnap = list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, - ci_item); - if (capsnap->writing) { - capsnap->writing = 0; - flushsnaps = - __ceph_finish_cap_snap(ci, - capsnap); - wake = 1; - } + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(capsnap)) + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; + } + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) @@ -2352,7 +2630,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ceph_flush_snaps(ci); if (wake) wake_up_all(&ci->i_cap_wq); - if (put) + while (put-- > 0) iput(inode); } @@ -2380,7 +2658,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && - ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2401,25 +2681,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = 1; - if (capsnap->dirty == 0) - /* cap writeback completed before we created - * the cap_snap; no FLUSHSNAP is needed */ - drop_capsnap = 1; + drop_capsnap = ceph_try_drop_cap_snap(capsnap); } dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s%s\n", + " snap %lld %d/%d -> %d/%d %s%s\n", inode, capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", - complete_capsnap ? " (complete capsnap)" : "", - drop_capsnap ? " (drop capsnap)" : ""); - if (drop_capsnap) { - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); - } + complete_capsnap ? " (complete capsnap)" : ""); } spin_unlock(&ci->i_ceph_lock); @@ -2526,7 +2796,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { if (try_nonblocking_invalidate(inode)) { @@ -2732,16 +3003,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *cf; + struct rb_node *n; + LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; int drop = 0; - int i; - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((dirty & (1 << i)) && - (u16)flush_tid == ci->i_cap_flush_tid[i]) - cleaned |= 1 << i; + n = rb_first(&ci->i_cap_flush_tree); + while (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + n = rb_next(&cf->i_node); + if (cf->tid == flush_tid) + cleaned = cf->caps; + if (cf->tid <= flush_tid) { + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add_tail(&cf->list, &to_remove); + } else { + cleaned &= ~cf->caps; + if (!cleaned) + break; + } + } dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," " flushing %s -> %s\n", @@ -2749,12 +3033,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); - if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); + + if (!list_empty(&to_remove)) { + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (!cf || cf->tid > flush_tid) + wake_up_all(&mdsc->cap_flushing_wq); + } + if (ci->i_flushing_caps == 0) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) @@ -2764,14 +3059,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; - if (ci->i_wrbuffer_ref_head == 0) { + if (ci->i_wr_ref == 0 && + ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2785,6 +3080,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, out: spin_unlock(&ci->i_ceph_lock); + + while (!list_empty(&to_remove)) { + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } if (drop) iput(inode); } @@ -2800,6 +3102,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap; int drop = 0; @@ -2823,6 +3126,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, list_del(&capsnap->ci_item); list_del(&capsnap->flushing_item); ceph_put_cap_snap(capsnap); + wake_up_all(&mdsc->cap_flushing_wq); drop = 1; break; } else { @@ -2971,7 +3275,6 @@ retry: mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } - ceph_add_cap_releases(mdsc, tsession); new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); @@ -3167,16 +3470,20 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); - if (op == CEPH_CAP_OP_IMPORT) - ceph_add_cap_releases(mdsc, session); - if (!inode) { dout(" i don't have ino %llx\n", vino.ino); if (op == CEPH_CAP_OP_IMPORT) { + cap = ceph_get_cap(mdsc, NULL); + cap->cap_ino = vino.ino; + cap->queue_release = 1; + cap->cap_id = cap_id; + cap->mseq = mseq; + cap->seq = seq; spin_lock(&session->s_cap_lock); - __queue_cap_release(session, vino.ino, cap_id, - mseq, seq); + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; spin_unlock(&session->s_cap_lock); } goto flush_cap_releases; @@ -3252,11 +3559,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, flush_cap_releases: /* - * send any full release message to try to move things + * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); done: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4248307fea90..9314b4ea2375 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -107,6 +107,27 @@ static int fpos_cmp(loff_t l, loff_t r) } /* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. + */ +static int note_last_dentry(struct ceph_file_info *fi, const char *name, + int len, unsigned next_offset) +{ + char *buf = kmalloc(len+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + kfree(fi->last_name); + fi->last_name = buf; + memcpy(fi->last_name, name, len); + fi->last_name[len] = 0; + fi->next_offset = next_offset; + dout("note_last_dentry '%s'\n", fi->last_name); + return 0; +} + +/* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on * d_child when we initially get results back from the MDS, and @@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_path.dentry; struct inode *dir = d_inode(parent); - struct list_head *p; - struct dentry *dentry, *last; + struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; + unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *); int err = 0; + loff_t ptr_pos = 0; + struct ceph_readdir_cache_control cache_ctl = {}; - /* claim ref on last dentry we returned */ - last = fi->dentry; - fi->dentry = NULL; - - dout("__dcache_readdir %p v%u at %llu (last %p)\n", - dir, shared_gen, ctx->pos, last); + dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); - spin_lock(&parent->d_lock); - - /* start at beginning? */ - if (ctx->pos == 2 || last == NULL || - fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { - if (list_empty(&parent->d_subdirs)) - goto out_unlock; - p = parent->d_subdirs.prev; - dout(" initial p %p/%p\n", p->prev, p->next); - } else { - p = last->d_child.prev; + /* we can calculate cache index for the first dirfrag */ + if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { + cache_ctl.index = fpos_off(ctx->pos) - 2; + BUG_ON(cache_ctl.index < 0); + ptr_pos = cache_ctl.index * sizeof(struct dentry *); } -more: - dentry = list_entry(p, struct dentry, d_child); - di = ceph_dentry(dentry); - while (1) { - dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, - d_unhashed(dentry) ? "!hashed" : "hashed", - parent->d_subdirs.prev, parent->d_subdirs.next); - if (p == &parent->d_subdirs) { + while (true) { + pgoff_t pgoff; + bool emit_dentry; + + if (ptr_pos >= i_size_read(dir)) { fi->flags |= CEPH_F_ATEND; - goto out_unlock; + err = 0; + break; + } + + err = -EAGAIN; + pgoff = ptr_pos >> PAGE_CACHE_SHIFT; + if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { + ceph_readdir_cache_release(&cache_ctl); + cache_ctl.page = find_lock_page(&dir->i_data, pgoff); + if (!cache_ctl.page) { + dout(" page %lu not found\n", pgoff); + break; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(cache_ctl.page); + cache_ctl.dentries = kmap(cache_ctl.page); } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && + ptr_pos < i_size_read(dir)) + dentry = cache_ctl.dentries[cache_ctl.index % nsize]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + if (!dentry) + break; + + emit_dentry = false; + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && - !d_unhashed(dentry) && d_really_is_positive(dentry) && + d_really_is_positive(dentry) && ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && - fpos_cmp(ctx->pos, di->offset) <= 0) - break; - dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, - dentry, di->offset, - ctx->pos, d_unhashed(dentry) ? " unhashed" : "", - !d_inode(dentry) ? " null" : ""); + fpos_cmp(ctx->pos, di->offset) <= 0) { + emit_dentry = true; + } spin_unlock(&dentry->d_lock); - p = p->prev; - dentry = list_entry(p, struct dentry, d_child); - di = ceph_dentry(dentry); - } - - dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete_ordered(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - dput(dentry); - err = -EAGAIN; - goto out; - } + if (emit_dentry) { + dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dentry, dentry, d_inode(dentry)); + ctx->pos = di->offset; + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, + ceph_translate_ino(dentry->d_sb, + d_inode(dentry)->i_ino), + d_inode(dentry)->i_mode >> 12)) { + dput(dentry); + err = 0; + break; + } + ctx->pos++; - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, - dentry, dentry, d_inode(dentry)); - if (!dir_emit(ctx, dentry->d_name.name, - dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino), - d_inode(dentry)->i_mode >> 12)) { - if (last) { - /* remember our position */ - fi->dentry = last; - fi->next_offset = fpos_off(di->offset); + if (last) + dput(last); + last = dentry; + } else { + dput(dentry); } - dput(dentry); - return 0; - } - - ctx->pos = di->offset + 1; - if (last) - dput(last); - last = dentry; - - spin_lock(&parent->d_lock); - p = p->prev; /* advance to next dentry */ - goto more; - -out_unlock: - spin_unlock(&parent->d_lock); -out: - if (last) + cache_ctl.index++; + ptr_pos += sizeof(struct dentry *); + } + ceph_readdir_cache_release(&cache_ctl); + if (last) { + int ret; + di = ceph_dentry(last); + ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, + fpos_off(di->offset) + 1); + if (ret < 0) + err = ret; dput(last); + } return err; } -/* - * make note of the last dentry we read, so we can - * continue at the same lexicographical point, - * regardless of what dir changes take place on the - * server. - */ -static int note_last_dentry(struct ceph_file_info *fi, const char *name, - int len) -{ - kfree(fi->last_name); - fi->last_name = kmalloc(len+1, GFP_NOFS); - if (!fi->last_name) - return -ENOMEM; - memcpy(fi->last_name, name, len); - fi->last_name[len] = 0; - dout("note_last_dentry '%s'\n", fi->last_name); - return 0; -} - static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); - if ((ctx->pos == 2 || fi->dentry) && - ceph_test_mount_opt(fsc, DCACHE) && + if (ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && @@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) } else { spin_unlock(&ci->i_ceph_lock); } - if (fi->dentry) { - err = note_last_dentry(fi, fi->dentry->d_name.name, - fi->dentry->d_name.len); - if (err) - return err; - dput(fi->dentry); - fi->dentry = NULL; - } /* proceed with a normal readdir */ - - if (ctx->pos == 2) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - fi->dir_release_count = atomic_read(&ci->i_release_count); - fi->dir_ordered_count = ci->i_ordered_count; - } - more: /* do we have the correct frag content buffered? */ if (fi->frag != frag || fi->last_readdir == NULL) { @@ -342,12 +336,15 @@ more: req->r_direct_hash = ceph_frag_value(frag); req->r_direct_is_hash = true; if (fi->last_name) { - req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; } } + req->r_dir_release_cnt = fi->dir_release_count; + req->r_dir_ordered_cnt = fi->dir_ordered_count; + req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); @@ -364,26 +361,38 @@ more: (int)req->r_reply_info.dir_end, (int)req->r_reply_info.dir_complete); - if (!req->r_did_prepopulate) { - dout("readdir !did_prepopulate"); - /* preclude from marking dir complete */ - fi->dir_release_count--; - } /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - off = fi->next_offset; + off = req->r_readdir_offset; + fi->next_offset = off; } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; + if (req->r_did_prepopulate) { + fi->readdir_cache_idx = req->r_readdir_cache_idx; + if (fi->readdir_cache_idx < 0) { + /* preclude from marking dir ordered */ + fi->dir_ordered_count = 0; + } else if (ceph_frag_is_leftmost(frag) && off == 2) { + /* note dir version at start of readdir so + * we can tell if any dentries get dropped */ + fi->dir_release_count = req->r_dir_release_cnt; + fi->dir_ordered_count = req->r_dir_ordered_cnt; + } + } else { + dout("readdir !did_prepopulate"); + /* disable readdir cache */ + fi->readdir_cache_idx = -1; + /* preclude from marking dir complete */ + fi->dir_release_count = 0; + } + if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; @@ -394,10 +403,10 @@ more: } else { err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1]); + rinfo->dir_dname_len[rinfo->dir_nr-1], + fi->next_offset + rinfo->dir_nr); if (err) return err; - fi->next_offset += rinfo->dir_nr; } } @@ -453,16 +462,22 @@ more: * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - spin_lock(&ci->i_ceph_lock); - if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { - if (ci->i_ordered_count == fi->dir_ordered_count) + if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { + spin_lock(&ci->i_ceph_lock); + if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { dout(" marking %p complete and ordered\n", inode); - else + /* use i_size to track number of entries in + * readdir cache */ + BUG_ON(fi->readdir_cache_idx < 0); + i_size_write(inode, fi->readdir_cache_idx * + sizeof(struct dentry*)); + } else { dout(" marking %p complete\n", inode); + } __ceph_dir_set_complete(ci, fi->dir_release_count, fi->dir_ordered_count); + spin_unlock(&ci->i_ceph_lock); } - spin_unlock(&ci->i_ceph_lock); dout("readdir %p file %p done.\n", inode, file); return 0; @@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) } kfree(fi->last_name); fi->last_name = NULL; + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; if (ceph_frag_is_leftmost(frag)) fi->next_offset = 2; /* compensate for . and .. */ else fi->next_offset = 0; - if (fi->dentry) { - dput(fi->dentry); - fi->dentry = NULL; - } fi->flags &= ~CEPH_F_ATEND; } @@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) mutex_lock(&inode->i_mutex); retval = -EINVAL; switch (whence) { - case SEEK_END: - offset += inode->i_size + 2; /* FIXME */ - break; case SEEK_CUR: offset += file->f_pos; case SEEK_SET: break; + case SEEK_END: + retval = -EOPNOTSUPP; default: goto out; } @@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } retval = offset; - /* - * discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk. - */ if (offset == 0 || fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { + /* discard buffered readdir content on seekdir(0), or + * seek to new frag, or seek prior to current chunk */ dout("dir_llseek dropping %p content\n", file); reset_readdir(fi, fpos_frag(offset)); + } else if (fpos_cmp(offset, old_offset) > 0) { + /* reset dir_release_count if we did a forward seek */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; } - - /* bump dir_release_count if we did a forward seek */ - if (fpos_cmp(offset, old_offset) > 0) - fi->dir_release_count--; } out: mutex_unlock(&inode->i_mutex); @@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, err = PTR_ERR(req); goto out; } - req->r_path2 = kstrdup(dest, GFP_NOFS); + req->r_path2 = kstrdup(dest, GFP_KERNEL); if (!req->r_path2) { err = -ENOMEM; ceph_mdsc_put_request(req); @@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); - - /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_complete(old_dir); - ceph_dir_clear_complete(new_dir); - } ceph_mdsc_put_request(req); return err; @@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, return -EISDIR; if (!cf->dir_info) { - cf->dir_info = kmalloc(bufsize, GFP_NOFS); + cf->dir_info = kmalloc(bufsize, GFP_KERNEL); if (!cf->dir_info) return -ENOMEM; cf->dir_info_len = @@ -1224,66 +1233,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, } /* - * an fsync() on a dir will wait for any uncommitted directory - * operations to commit. - */ -static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_dirops; - struct ceph_mds_request *req; - u64 last_tid; - int ret = 0; - - dout("dir_fsync %p\n", inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - req = list_entry(head->prev, - struct ceph_mds_request, r_unsafe_dir_item); - last_tid = req->r_tid; - - do { - ceph_mdsc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - - dout("dir_fsync %p wait on tid %llu (until %llu)\n", - inode, req->r_tid, last_tid); - if (req->r_timeout) { - unsigned long time_left = wait_for_completion_timeout( - &req->r_safe_completion, - req->r_timeout); - if (time_left > 0) - ret = 0; - else - ret = -EIO; /* timed out */ - } else { - wait_for_completion(&req->r_safe_completion); - } - ceph_mdsc_put_request(req); - - spin_lock(&ci->i_unsafe_lock); - if (ret || list_empty(head)) - break; - req = list_entry(head->next, - struct ceph_mds_request, r_unsafe_dir_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* * We maintain a private dentry LRU. * * FIXME: this needs to be changed to a per-mds lru to be useful. @@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = { .open = ceph_open, .release = ceph_release, .unlocked_ioctl = ceph_ioctl, - .fsync = ceph_dir_fsync, + .fsync = ceph_fsync, }; const struct file_operations ceph_snapdir_fops = { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b6b522b4b31..8b79d87eaf46 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); - cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); + cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); if (cf == NULL) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } cf->fmode = fmode; cf->next_offset = 2; + cf->readdir_cache_idx = -1; file->private_data = cf; BUG_ON(inode->i_fop->release != ceph_release); break; @@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file) ceph_mdsc_put_request(cf->last_readdir); kfree(cf->last_name); kfree(cf->dir_info); - dput(cf->dentry); kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ @@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, } } else { num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); ret = striped_read(inode, off, len, pages, @@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; @@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) size_t start; ssize_t n; - snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &len, 0, @@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); n = iov_iter_get_pages_alloc(from, &pages, len, &start); if (unlikely(n < 0)) { @@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; @@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) size_t left; int n; - snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &len, 0, 1, @@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) */ num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -860,7 +858,7 @@ again: struct page *page = NULL; loff_t i_size; if (retry_op == READ_INLINE) { - page = __page_cache_alloc(GFP_NOFS); + page = __page_cache_alloc(GFP_KERNEL); if (!page) return -ENOMEM; } @@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; int err, want, got; loff_t pos; @@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -959,7 +962,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; count = iov_iter_count(from); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -996,14 +999,30 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct ceph_snap_context *snapc; struct iov_iter data; mutex_unlock(&inode->i_mutex); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + /* we might need to revert back to that point */ data = *from; if (iocb->ki_flags & IOCB_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos); + written = ceph_sync_direct_write(iocb, &data, pos, + snapc); else - written = ceph_sync_write(iocb, &data, pos); + written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", @@ -1014,6 +1033,7 @@ retry_snap: } if (written > 0) iov_iter_advance(from, written); + ceph_put_snap_context(snapc); } else { loff_t old_size = inode->i_size; /* @@ -1035,7 +1055,8 @@ retry_snap: int dirty; spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1059,6 +1080,7 @@ retry_snap: out: mutex_unlock(&inode->i_mutex); out_unlocked: + ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; return written ? written : err; } @@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; + struct ceph_cap_flush *prealloc_cf; int want, got = 0; int dirty; int ret = 0; @@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: mutex_unlock(&inode->i_mutex); + ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 571acd88606c..96d2bd829902 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_inline_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_ordered_count = 0; - atomic_set(&ci->i_release_count, 1); - atomic_set(&ci->i_complete_count, 0); + atomic64_set(&ci->i_ordered_count, 1); + atomic64_set(&ci->i_release_count, 1); + atomic64_set(&ci->i_complete_seq[0], 0); + atomic64_set(&ci->i_complete_seq[1], 0); ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); @@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_flushing_caps = 0; INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_flushing_item); - ci->i_cap_flush_seq = 0; - ci->i_cap_flush_last_tid = 0; - memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + ci->i_prealloc_cap_flush = NULL; + ci->i_cap_flush_tree = RB_ROOT; init_waitqueue_head(&ci->i_cap_wq); ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; @@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), @@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); + i_size_write(inode, 0); __ceph_dir_set_complete(ci, - atomic_read(&ci->i_release_count), - ci->i_ordered_count); + atomic64_read(&ci->i_release_count), + atomic64_read(&ci->i_ordered_count)); } wake = true; @@ -1212,6 +1216,10 @@ retry_lookup: dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); + d_move(req->r_old_dentry, dn); dout(" src %p '%pd' dst %p '%pd'\n", req->r_old_dentry, @@ -1222,10 +1230,6 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_ordered(dir); - ceph_dir_clear_ordered(olddir); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, return err; } +void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) +{ + if (ctl->page) { + kunmap(ctl->page); + page_cache_release(ctl->page); + ctl->page = NULL; + } +} + +static int fill_readdir_cache(struct inode *dir, struct dentry *dn, + struct ceph_readdir_cache_control *ctl, + struct ceph_mds_request *req) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*); + unsigned idx = ctl->index % nsize; + pgoff_t pgoff = ctl->index / nsize; + + if (!ctl->page || pgoff != page_index(ctl->page)) { + ceph_readdir_cache_release(ctl); + ctl->page = grab_cache_page(&dir->i_data, pgoff); + if (!ctl->page) { + ctl->index = -1; + return -ENOMEM; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(ctl->page); + ctl->dentries = kmap(ctl->page); + } + + if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && + req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { + dout("readdir cache dn %p idx %d\n", dn, ctl->index); + ctl->dentries[idx] = dn; + ctl->index++; + } else { + dout("disable readdir cache\n"); + ctl->index = -1; + } + return 0; +} + int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { @@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_dentry_info *di; - u64 r_readdir_offset = req->r_readdir_offset; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + struct ceph_readdir_cache_control cache_ctl = {}; + + if (req->r_aborted) + return readdir_prepopulate_inodes_only(req, session); if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { @@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); if (ceph_frag_is_leftmost(frag)) - r_readdir_offset = 2; + req->r_readdir_offset = 2; else - r_readdir_offset = 0; + req->r_readdir_offset = 0; } - if (req->r_aborted) - return readdir_prepopulate_inodes_only(req, session); - if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { snapdir = ceph_get_snapdir(d_inode(parent)); parent = d_find_alias(snapdir); @@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } + if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; + } + + cache_ctl.index = req->r_readdir_cache_idx; + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1413,13 +1471,6 @@ retry_lookup: d_delete(dn); dput(dn); goto retry_lookup; - } else { - /* reorder parent's d_subdirs */ - spin_lock(&parent->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_child, &parent->d_subdirs); - spin_unlock(&dn->d_lock); - spin_unlock(&parent->d_lock); } /* inode */ @@ -1436,13 +1487,15 @@ retry_lookup: } } - if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, - req->r_request_started, -1, - &req->r_caps_reservation) < 0) { + ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation); + if (ret < 0) { pr_err("fill_inode badness on %p\n", in); if (d_really_is_negative(dn)) iput(in); d_drop(dn); + err = ret; goto next_item; } @@ -1458,19 +1511,28 @@ retry_lookup: } di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); update_dentry_lease(dn, rinfo->dir_dlease[i], req->r_session, req->r_request_started); + + if (err == 0 && cache_ctl.index >= 0) { + ret = fill_readdir_cache(d_inode(parent), dn, + &cache_ctl, req); + if (ret < 0) + err = ret; + } next_item: if (dn) dput(dn); } - if (err == 0) - req->r_did_prepopulate = true; - out: + if (err == 0) { + req->r_did_prepopulate = true; + req->r_readdir_cache_idx = cache_ctl.index; + } + ceph_readdir_cache_release(&cache_ctl); if (snapdir) { iput(snapdir); dput(parent); @@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf; int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; + bool lock_snap_rwsem = false; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, USE_AUTH_MDS); - if (IS_ERR(req)) + if (IS_ERR(req)) { + ceph_free_cap_flush(prealloc_cf); return PTR_ERR(req); + } spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); + + if (!ci->i_head_snapc && + (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + } + } + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ia_valid & ATTR_UID) { @@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dout("setattr %p ATTR_FILE ... hrm!\n", inode); if (dirtied) { - inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, + &prealloc_cf); inode->i_ctime = CURRENT_TIME; } release &= issued; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_mdsc_put_request(req); if (mask & CEPH_SETATTR_SIZE) __ceph_do_pending_vmtruncate(inode); + ceph_free_cap_flush(prealloc_cf); return err; out_put: ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 4347039ecc18..6706bde9ad1b 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -287,7 +287,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + list_for_each_entry(lock, &ctx->flc_posix, fl_list) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { err = -ENOSPC; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 84f37f34f9aa..6aa07af67603 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -8,6 +8,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include <linux/ratelimit.h> #include "super.h" #include "mds_client.h" @@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); - INIT_LIST_HEAD(&s->s_cap_releases_done); INIT_LIST_HEAD(&s->s_cap_flushing); INIT_LIST_HEAD(&s->s_cap_snaps_flushing); @@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); + if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) + mdsc->oldest_tid = req->r_tid; + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + + if (req->r_tid == mdsc->oldest_tid) { + struct rb_node *p = rb_next(&req->r_node); + mdsc->oldest_tid = 0; + while (p) { + struct ceph_mds_request *next_req = + rb_entry(p, struct ceph_mds_request, r_node); + if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { + mdsc->oldest_tid = next_req->r_tid; + break; + } + p = rb_next(p); + } + } + rb_erase(&req->r_node, &mdsc->request_tree); RB_CLEAR_NODE(&req->r_node); @@ -998,27 +1016,25 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, * session caps */ -/* - * Free preallocated cap messages assigned to this session - */ -static void cleanup_cap_releases(struct ceph_mds_session *session) +/* caller holds s_cap_lock, we drop it */ +static void cleanup_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) + __releases(session->s_cap_lock) { - struct ceph_msg *msg; + LIST_HEAD(tmp_list); + list_splice_init(&session->s_cap_releases, &tmp_list); + session->s_num_cap_releases = 0; + spin_unlock(&session->s_cap_lock); - spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); - } - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); + dout("cleanup_cap_releases mds%d\n", session->s_mds); + while (!list_empty(&tmp_list)) { + struct ceph_cap *cap; + /* zero out the in-progress message */ + cap = list_first_entry(&tmp_list, + struct ceph_cap, session_caps); + list_del(&cap->session_caps); + ceph_put_cap(mdsc, cap); } - spin_unlock(&session->s_cap_lock); } static void cleanup_session_requests(struct ceph_mds_client *mdsc, @@ -1033,7 +1049,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); list_del_init(&req->r_unsafe_item); - pr_info(" dropping unsafe request %llu\n", req->r_tid); + pr_warn_ratelimited(" dropping unsafe request %llu\n", + req->r_tid); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1095,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session, dout("iterate_session_caps finishing cap %p removal\n", cap); BUG_ON(cap->session != session); + cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; - cap->session = NULL; - old_cap = cap; /* put_cap it w/o locks held */ + if (cap->queue_release) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + } else { + old_cap = cap; /* put_cap it w/o locks held */ + } } if (ret < 0) goto out; @@ -1119,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + LIST_HEAD(to_remove); int drop = 0; dout("removing cap %p, ci is %p, inode is %p\n", @@ -1126,12 +1150,27 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { + struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + while (true) { + struct rb_node *n = rb_first(&ci->i_cap_flush_tree); + if (!n) + break; + cf = rb_entry(n, struct ceph_cap_flush, i_node); + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add(&cf->list, &to_remove); + } + spin_lock(&mdsc->cap_dirty_lock); + + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + if (!list_empty(&ci->i_dirty_item)) { - pr_info(" dropping dirty %s state for %p %lld\n", + pr_warn_ratelimited( + " dropping dirty %s state for %p %lld\n", ceph_cap_string(ci->i_dirty_caps), inode, ceph_ino(inode)); ci->i_dirty_caps = 0; @@ -1139,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, drop = 1; } if (!list_empty(&ci->i_flushing_item)) { - pr_info(" dropping dirty+flushing %s state for %p %lld\n", + pr_warn_ratelimited( + " dropping dirty+flushing %s state for %p %lld\n", ceph_cap_string(ci->i_flushing_caps), inode, ceph_ino(inode)); ci->i_flushing_caps = 0; @@ -1148,8 +1188,20 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, drop = 1; } spin_unlock(&mdsc->cap_dirty_lock); + + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { + list_add(&ci->i_prealloc_cap_flush->list, &to_remove); + ci->i_prealloc_cap_flush = NULL; + } } spin_unlock(&ci->i_ceph_lock); + while (!list_empty(&to_remove)) { + struct ceph_cap_flush *cf; + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } while (drop--) iput(inode); return 0; @@ -1191,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session) spin_lock(&session->s_cap_lock); } } - spin_unlock(&session->s_cap_lock); + + // drop cap expires and unlock s_cap_lock + cleanup_cap_releases(session->s_mdsc, session); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); - cleanup_cap_releases(session); } /* @@ -1371,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), ceph_cap_string(used), ceph_cap_string(wanted)); if (cap == ci->i_auth_cap) { - if (ci->i_dirty_caps | ci->i_flushing_caps) + if (ci->i_dirty_caps || ci->i_flushing_caps || + !list_empty(&ci->i_cap_snaps)) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; @@ -1417,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc, session->s_trim_caps = 0; } - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); return 0; } -/* - * Allocate cap_release messages. If there is a partially full message - * in the queue, try to allocate enough to cover it's remainder, so that - * we can send it immediately. - * - * Called under s_mutex. - */ -int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static int check_capsnap_flush(struct ceph_inode_info *ci, + u64 want_snap_seq) { - struct ceph_msg *msg, *partial = NULL; - struct ceph_mds_cap_release *head; - int err = -ENOMEM; - int extra = mdsc->fsc->mount_options->cap_release_safety; - int num; - - dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, - extra); - - spin_lock(&session->s_cap_lock); - - if (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, - list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - if (num) { - dout(" partial %p with (%d/%d)\n", msg, num, - (int)CEPH_CAPS_PER_RELEASE); - extra += CEPH_CAPS_PER_RELEASE - num; - partial = msg; - } - } - while (session->s_num_cap_releases < session->s_nr_caps + extra) { - spin_unlock(&session->s_cap_lock); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS, false); - if (!msg) - goto out_unlocked; - dout("add_cap_releases %p msg %p now %d\n", session, msg, - (int)msg->front.iov_len); - head = msg->front.iov_base; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - spin_lock(&session->s_cap_lock); - list_add(&msg->list_head, &session->s_cap_releases); - session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; - } - - if (partial) { - head = partial->front.iov_base; - num = le32_to_cpu(head->num); - dout(" queueing partial %p with %d/%d\n", partial, num, - (int)CEPH_CAPS_PER_RELEASE); - list_move_tail(&partial->list_head, - &session->s_cap_releases_done); - session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; + int ret = 1; + spin_lock(&ci->i_ceph_lock); + if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { + struct ceph_cap_snap *capsnap = + list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + ret = capsnap->follows >= want_snap_seq; } - err = 0; - spin_unlock(&session->s_cap_lock); -out_unlocked: - return err; + spin_unlock(&ci->i_ceph_lock); + return ret; } -static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +static int check_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid) { - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - spin_lock(&ci->i_ceph_lock); - if (ci->i_flushing_caps) - ret = ci->i_cap_flush_seq >= want_flush_seq; - else - ret = 1; - spin_unlock(&ci->i_ceph_lock); + struct rb_node *n; + struct ceph_cap_flush *cf; + int ret = 1; + + spin_lock(&mdsc->cap_dirty_lock); + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (cf && cf->tid <= want_flush_tid) { + dout("check_caps_flush still flushing tid %llu <= %llu\n", + cf->tid, want_flush_tid); + ret = 0; + } + spin_unlock(&mdsc->cap_dirty_lock); return ret; } /* * flush all dirty inode data to disk. * - * returns true if we've flushed through want_flush_seq + * returns true if we've flushed through want_flush_tid */ -static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void wait_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid, u64 want_snap_seq) { int mds; - dout("check_cap_flush want %lld\n", want_flush_seq); + dout("check_caps_flush want %llu snap want %llu\n", + want_flush_tid, want_snap_seq); mutex_lock(&mdsc->mutex); - for (mds = 0; mds < mdsc->max_sessions; mds++) { + for (mds = 0; mds < mdsc->max_sessions; ) { struct ceph_mds_session *session = mdsc->sessions[mds]; struct inode *inode = NULL; - if (!session) + if (!session) { + mds++; continue; + } get_session(session); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); - if (!list_empty(&session->s_cap_flushing)) { - struct ceph_inode_info *ci = - list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item); - - if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { - dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", - &ci->vfs_inode, ci->i_cap_flush_seq, - want_flush_seq, session->s_mds); + if (!list_empty(&session->s_cap_snaps_flushing)) { + struct ceph_cap_snap *capsnap = + list_first_entry(&session->s_cap_snaps_flushing, + struct ceph_cap_snap, + flushing_item); + struct ceph_inode_info *ci = capsnap->ci; + if (!check_capsnap_flush(ci, want_snap_seq)) { + dout("check_cap_flush still flushing snap %p " + "follows %lld <= %lld to mds%d\n", + &ci->vfs_inode, capsnap->follows, + want_snap_seq, mds); inode = igrab(&ci->vfs_inode); } } @@ -1540,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) if (inode) { wait_event(mdsc->cap_flushing_wq, - check_cap_flush(inode, want_flush_seq)); + check_capsnap_flush(ceph_inode(inode), + want_snap_seq)); iput(inode); + } else { + mds++; } mutex_lock(&mdsc->mutex); } - mutex_unlock(&mdsc->mutex); - dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + + wait_event(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid)); + + dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid); } /* @@ -1557,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { - struct ceph_msg *msg; + struct ceph_msg *msg = NULL; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + struct ceph_cap *cap; + LIST_HEAD(tmp_list); + int num_cap_releases; - dout("send_cap_releases mds%d\n", session->s_mds); spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - spin_unlock(&session->s_cap_lock); - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("send_cap_releases mds%d %p\n", session->s_mds, msg); - ceph_con_send(&session->s_con, msg); - spin_lock(&session->s_cap_lock); - } +again: + list_splice_init(&session->s_cap_releases, &tmp_list); + num_cap_releases = session->s_num_cap_releases; + session->s_num_cap_releases = 0; spin_unlock(&session->s_cap_lock); -} -static void discard_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - unsigned num; - - dout("discard_cap_releases mds%d\n", session->s_mds); + while (!list_empty(&tmp_list)) { + if (!msg) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, + PAGE_CACHE_SIZE, GFP_NOFS, false); + if (!msg) + goto out_err; + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + } + cap = list_first_entry(&tmp_list, struct ceph_cap, + session_caps); + list_del(&cap->session_caps); + num_cap_releases--; - if (!list_empty(&session->s_cap_releases)) { - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", - session->s_mds, msg, num); - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - session->s_num_cap_releases += num; + le32_add_cpu(&head->num, 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(cap->cap_ino); + item->cap_id = cpu_to_le64(cap->cap_id); + item->migrate_seq = cpu_to_le32(cap->mseq); + item->seq = cpu_to_le32(cap->issue_seq); + msg->front.iov_len += sizeof(*item); + + ceph_put_cap(mdsc, cap); + + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + msg = NULL; + } } - /* requeue completed messages */ - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); + BUG_ON(num_cap_releases != 0); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, - num); - session->s_num_cap_releases += num; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - list_add(&msg->list_head, &session->s_cap_releases); + spin_lock(&session->s_cap_lock); + if (!list_empty(&session->s_cap_releases)) + goto again; + spin_unlock(&session->s_cap_lock); + + if (msg) { + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); } + return; +out_err: + pr_err("send_cap_releases mds%d, failed to allocate message\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + list_splice(&tmp_list, &session->s_cap_releases); + session->s_num_cap_releases += num_cap_releases; + spin_unlock(&session->s_cap_lock); } /* @@ -1635,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, order); if (rinfo->dir_in) break; @@ -1697,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) struct ceph_mds_request, r_node); } -static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) { - struct ceph_mds_request *req = __get_oldest_req(mdsc); - - if (req) - return req->r_tid; - return 0; + return mdsc->oldest_tid; } /* @@ -2267,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* wait */ mutex_unlock(&mdsc->mutex); dout("do_request waiting\n"); - if (req->r_timeout) { - err = (long)wait_for_completion_killable_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - err = -EIO; - } else if (req->r_wait_for_completion) { + if (!req->r_timeout && req->r_wait_for_completion) { err = req->r_wait_for_completion(mdsc, req); } else { - err = wait_for_completion_killable(&req->r_completion); + long timeleft = wait_for_completion_killable_timeout( + &req->r_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (timeleft > 0) + err = 0; + else if (!timeleft) + err = -EIO; /* timed out */ + else + err = timeleft; /* killed */ } dout("do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); @@ -2496,7 +2529,6 @@ out_err: } mutex_unlock(&mdsc->mutex); - ceph_add_cap_releases(mdsc, req->r_session); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2888,8 +2920,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ - discard_cap_releases(mdsc, session); - spin_unlock(&session->s_cap_lock); + cleanup_cap_releases(mdsc, session); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) @@ -2956,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, reply->hdr.data_len = cpu_to_le32(pagelist->length); ceph_msg_data_add_pagelist(reply, pagelist); + + ceph_early_kick_flushing_caps(mdsc, session); + ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -3352,7 +3386,6 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - ceph_add_cap_releases(mdsc, s); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); @@ -3390,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; + mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); spin_lock_init(&mdsc->snap_empty_lock); mdsc->last_tid = 0; + mdsc->oldest_tid = 0; mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; @@ -3402,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); - mdsc->cap_flush_seq = 0; + mdsc->last_cap_flush_tid = 1; + mdsc->cap_flush_tree = RB_ROOT; INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; @@ -3414,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) ceph_caps_init(mdsc); ceph_adjust_min_caps(mdsc, fsc->min_caps); + init_rwsem(&mdsc->pool_perm_rwsem); + mdsc->pool_perm_tree = RB_ROOT; + return 0; } @@ -3423,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) */ static void wait_requests(struct ceph_mds_client *mdsc) { + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_request *req; - struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { @@ -3432,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - fsc->client->options->mount_timeout * HZ); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); @@ -3485,7 +3524,8 @@ restart: nextreq = rb_entry(n, struct ceph_mds_request, r_node); else nextreq = NULL; - if ((req->r_op & CEPH_MDS_OP_WRITE)) { + if (req->r_op != CEPH_MDS_OP_SETFILELOCK && + (req->r_op & CEPH_MDS_OP_WRITE)) { /* write op */ ceph_mdsc_get_request(req); if (nextreq) @@ -3513,7 +3553,7 @@ restart: void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { - u64 want_tid, want_flush; + u64 want_tid, want_flush, want_snap; if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return; @@ -3525,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ceph_flush_dirty_caps(mdsc); spin_lock(&mdsc->cap_dirty_lock); - want_flush = mdsc->cap_flush_seq; + want_flush = mdsc->last_cap_flush_tid; spin_unlock(&mdsc->cap_dirty_lock); - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); + down_read(&mdsc->snap_rwsem); + want_snap = mdsc->last_snap_seq; + up_read(&mdsc->snap_rwsem); + + dout("sync want tid %lld flush_seq %lld snap_seq %lld\n", + want_tid, want_flush, want_snap); wait_unsafe_requests(mdsc, want_tid); - wait_caps_flush(mdsc, want_flush); + wait_caps_flush(mdsc, want_flush, want_snap); } /* @@ -3549,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc) */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_session *session; int i; - struct ceph_fs_client *fsc = mdsc->fsc; - unsigned long timeout = fsc->client->options->mount_timeout * HZ; dout("close_sessions\n"); @@ -3573,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("waiting for sessions to close\n"); wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), - timeout); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining sessions */ mutex_lock(&mdsc->mutex); @@ -3607,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); + ceph_pool_perm_destroy(mdsc); } void ceph_mdsc_destroy(struct ceph_fs_client *fsc) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 1875b5d985c6..762757e6cebf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -139,7 +139,6 @@ struct ceph_mds_session { int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ - struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; /* protected by mutex */ @@ -228,7 +227,7 @@ struct ceph_mds_request { int r_err; bool r_aborted; - unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ @@ -254,12 +253,21 @@ struct ceph_mds_request { bool r_got_unsafe, r_got_safe, r_got_result; bool r_did_prepopulate; + long long r_dir_release_cnt; + long long r_dir_ordered_cnt; + int r_readdir_cache_idx; u32 r_readdir_offset; struct ceph_cap_reservation r_caps_reservation; int r_num_caps; }; +struct ceph_pool_perm { + struct rb_node node; + u32 pool; + int perm; +}; + /* * mds client state */ @@ -284,12 +292,15 @@ struct ceph_mds_client { * references (implying they contain no inodes with caps) that * should be destroyed. */ + u64 last_snap_seq; struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ + u64 oldest_tid; /* oldest incomplete mds request, + excluding setfilelock requests */ struct rb_root request_tree; /* pending mds requests */ struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ @@ -298,7 +309,8 @@ struct ceph_mds_client { struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; - u64 cap_flush_seq; + u64 last_cap_flush_tid; + struct rb_root cap_flush_tree; struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ @@ -328,6 +340,9 @@ struct ceph_mds_client { spinlock_t dentry_lru_lock; struct list_head dentry_lru; int num_dentry; + + struct rw_semaphore pool_perm_rwsem; + struct rb_root pool_perm_tree; }; extern const char *ceph_mds_op_name(int op); @@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } -extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a97e39f09ba6..233d906aec02 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b) } -static struct ceph_snap_context *empty_snapc; +struct ceph_snap_context *ceph_empty_snapc; /* * build the snap context for a given realm. @@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) return 0; } - if (num == 0 && realm->seq == empty_snapc->seq) { - ceph_get_snap_context(empty_snapc); - snapc = empty_snapc; + if (num == 0 && realm->seq == ceph_empty_snapc->seq) { + ceph_get_snap_context(ceph_empty_snapc); + snapc = ceph_empty_snapc; goto done; } @@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) return 0; } +static bool has_new_snaps(struct ceph_snap_context *o, + struct ceph_snap_context *n) +{ + if (n->num_snaps == 0) + return false; + /* snaps are in descending order */ + return n->snaps[0] > o->seq; +} /* * When a snapshot is applied, the size/mtime inode metadata is queued @@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; + struct ceph_snap_context *old_snapc, *new_snapc; int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); @@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + old_snapc = ci->i_head_snapc; + new_snapc = ci->i_snap_realm->cached_context; + /* * If there is a write in progress, treat that as a dirty Fw, * even though it hasn't completed yet; by the time we finish @@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) writes in progress now were started before the previous cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); - kfree(capsnap); - } else if (ci->i_snap_realm->cached_context == empty_snapc) { - dout("queue_cap_snap %p empty snapc\n", inode); - kfree(capsnap); - } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { - struct ceph_snap_context *snapc = ci->i_head_snapc; - - /* - * if we are a sync write, we may need to go to the snaprealm - * to get the current snapc. - */ - if (!snapc) - snapc = ci->i_snap_realm->cached_context; + goto update_snapc; + } + if (ci->i_wrbuffer_ref_head == 0 && + !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + goto update_snapc; + } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", - inode, capsnap, snapc, ceph_cap_string(dirty)); - ihold(inode); + BUG_ON(!old_snapc); - atomic_set(&capsnap->nref, 1); - capsnap->ci = ci; - INIT_LIST_HEAD(&capsnap->ci_item); - INIT_LIST_HEAD(&capsnap->flushing_item); - - capsnap->follows = snapc->seq; - capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = dirty; - - capsnap->mode = inode->i_mode; - capsnap->uid = inode->i_uid; - capsnap->gid = inode->i_gid; - - if (dirty & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); - capsnap->xattr_blob = - ceph_buffer_get(ci->i_xattrs.blob); - capsnap->xattr_version = ci->i_xattrs.version; - } else { - capsnap->xattr_blob = NULL; - capsnap->xattr_version = 0; + /* + * There is no need to send FLUSHSNAP message to MDS if there is + * no new snapshot. But when there is dirty pages or on-going + * writes, we still need to create cap_snap. cap_snap is needed + * by the write path and page writeback path. + * + * also see ceph_try_drop_cap_snap() + */ + if (has_new_snaps(old_snapc, new_snapc)) { + if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) + capsnap->need_flush = true; + } else { + if (!(used & CEPH_CAP_FILE_WR) && + ci->i_wrbuffer_ref_head == 0) { + dout("queue_cap_snap %p " + "no new_snap|dirty_page|writing\n", inode); + goto update_snapc; } + } - capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - - /* dirty page count moved from _head to this cap_snap; - all subsequent writes page dirties occur _after_ this - snapshot. */ - capsnap->dirty_pages = ci->i_wrbuffer_ref_head; - ci->i_wrbuffer_ref_head = 0; - capsnap->context = snapc; - ci->i_head_snapc = - ceph_get_snap_context(ci->i_snap_realm->cached_context); - dout(" new snapc is %p\n", ci->i_head_snapc); - list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - - if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, - capsnap, snapc, snapc->seq); - capsnap->writing = 1; - } else { - /* note mtime, size NOW. */ - __ceph_finish_cap_snap(ci, capsnap); - } + dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", + inode, capsnap, old_snapc, ceph_cap_string(dirty), + capsnap->need_flush ? "" : "no_flush"); + ihold(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = old_snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; } else { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); - kfree(capsnap); + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = old_snapc; + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + old_snapc = NULL; + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, old_snapc, old_snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + capsnap = NULL; + +update_snapc: + if (ci->i_head_snapc) { + ci->i_head_snapc = ceph_get_snap_context(new_snapc); + dout(" new snapc is %p\n", new_snapc); + } spin_unlock(&ci->i_ceph_lock); + + kfree(capsnap); + ceph_put_snap_context(old_snapc); } /* @@ -699,6 +730,8 @@ more: /* queue realm for cap_snap creation */ list_add(&realm->dirty_item, &dirty_realms); + if (realm->seq > mdsc->last_snap_seq) + mdsc->last_snap_seq = realm->seq; invalidate = 1; } else if (!realm->cached_context) { @@ -964,14 +997,14 @@ out: int __init ceph_snap_init(void) { - empty_snapc = ceph_create_snap_context(0, GFP_NOFS); - if (!empty_snapc) + ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!ceph_empty_snapc) return -ENOMEM; - empty_snapc->seq = 1; + ceph_empty_snapc->seq = 1; return 0; } void ceph_snap_exit(void) { - ceph_put_snap_context(empty_snapc); + ceph_put_snap_context(ceph_empty_snapc); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4e9905374078..7b6bfcbf801c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -134,10 +134,12 @@ enum { Opt_noino32, Opt_fscache, Opt_nofscache, + Opt_poolperm, + Opt_nopoolperm, #ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, #endif - Opt_noacl + Opt_noacl, }; static match_table_t fsopt_tokens = { @@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = { {Opt_noino32, "noino32"}, {Opt_fscache, "fsc"}, {Opt_nofscache, "nofsc"}, + {Opt_poolperm, "poolperm"}, + {Opt_nopoolperm, "nopoolperm"}, #ifdef CONFIG_CEPH_FS_POSIX_ACL {Opt_acl, "acl"}, #endif @@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private) case Opt_nofscache: fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; break; + case Opt_poolperm: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + printk ("pool perm"); + break; + case Opt_nopoolperm: + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= MS_POSIXACL; @@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) seq_puts(m, ",fsc"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) + seq_puts(m, ",nopoolperm"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & MS_POSIXACL) @@ -466,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + seq_show_option(m, "snapdirname", fsopt->snapdir_name); return 0; } @@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; @@ -634,6 +648,10 @@ static int __init init_caches(void) SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); if (ceph_cap_cachep == NULL) goto bad_cap; + ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_flush_cachep == NULL) + goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); @@ -652,6 +670,8 @@ static int __init init_caches(void) bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: + kmem_cache_destroy(ceph_cap_flush_cachep); +bad_cap_flush: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); @@ -668,6 +688,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); @@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = fsc->client->options->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fa20e1318939..2f2460d23a06 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -35,6 +35,7 @@ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ +#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ CEPH_MOUNT_OPT_DCACHE) @@ -121,11 +122,21 @@ struct ceph_cap { struct rb_node ci_node; /* per-ci cap tree */ struct ceph_mds_session *session; struct list_head session_caps; /* per-session caplist */ - int mds; u64 cap_id; /* unique cap id (mds provided) */ - int issued; /* latest, from the mds */ - int implemented; /* implemented superset of issued (for revocation) */ - int mds_wanted; + union { + /* in-use caps */ + struct { + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of + issued (for revocation) */ + int mds, mds_wanted; + }; + /* caps to release */ + struct { + u64 cap_ino; + int queue_release; + }; + }; u32 seq, issue_seq, mseq; u32 cap_gen; /* active/stale cycle */ unsigned long last_used; @@ -163,6 +174,7 @@ struct ceph_cap_snap { int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ bool inline_data; + bool need_flush; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) @@ -174,6 +186,16 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) } } +struct ceph_cap_flush { + u64 tid; + int caps; + struct rb_node g_node; // global + union { + struct rb_node i_node; // inode + struct list_head list; + }; +}; + /* * The frag tree describes how a directory is fragmented, potentially across * multiple metadata servers. It is also used to indicate points where @@ -259,9 +281,9 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; - int i_ordered_count; - atomic_t i_release_count; - atomic_t i_complete_count; + atomic64_t i_release_count; + atomic64_t i_ordered_count; + atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; @@ -283,11 +305,11 @@ struct ceph_inode_info { struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ struct list_head i_dirty_item, i_flushing_item; - u64 i_cap_flush_seq; /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ - u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + struct ceph_cap_flush *i_prealloc_cap_flush; + struct rb_root i_cap_flush_tree; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ @@ -438,36 +460,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ +#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ +#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ +#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ +#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ + static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, - int release_count, int ordered_count) + long long release_count, + long long ordered_count) { - atomic_set(&ci->i_complete_count, release_count); - if (ci->i_ordered_count == ordered_count) - ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; - else - ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; + smp_mb__before_atomic(); + atomic64_set(&ci->i_complete_seq[0], release_count); + atomic64_set(&ci->i_complete_seq[1], ordered_count); } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - atomic_inc(&ci->i_release_count); + atomic64_inc(&ci->i_release_count); +} + +static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) +{ + atomic64_inc(&ci->i_ordered_count); } static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) { - return atomic_read(&ci->i_complete_count) == - atomic_read(&ci->i_release_count); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count); } static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) { - return __ceph_dir_is_complete(ci) && - (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count) && + atomic64_read(&ci->i_complete_seq[1]) == + atomic64_read(&ci->i_ordered_count); } static inline void ceph_dir_clear_complete(struct inode *inode) @@ -477,20 +509,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode) static inline void ceph_dir_clear_ordered(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - spin_lock(&ci->i_ceph_lock); - ci->i_ordered_count++; - ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; - spin_unlock(&ci->i_ceph_lock); + __ceph_dir_clear_ordered(ceph_inode(inode)); } static inline bool ceph_dir_is_complete_ordered(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool ret; - spin_lock(&ci->i_ceph_lock); - ret = __ceph_dir_is_complete_ordered(ci); - spin_unlock(&ci->i_ceph_lock); + bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); + smp_rmb(); return ret; } @@ -552,7 +577,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } -extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); +extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf); extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask); @@ -606,16 +634,20 @@ struct ceph_file_info { unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ - struct dentry *dentry; /* next dentry (for dcache readdir) */ - int dir_release_count; - int dir_ordered_count; + long long dir_release_count; + long long dir_ordered_count; + int readdir_cache_idx; /* used for -o dirstat read() on directory thing */ char *dir_info; int dir_info_len; }; - +struct ceph_readdir_cache_control { + struct page *page; + struct dentry **dentries; + int index; +}; /* * A "snap realm" describes a subset of the file hierarchy sharing @@ -687,6 +719,7 @@ static inline int default_congestion_kb(void) /* snap.c */ +extern struct ceph_snap_context *ceph_empty_snapc; struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, @@ -713,8 +746,8 @@ extern void ceph_snap_exit(void); static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) { return !list_empty(&ci->i_cap_snaps) && - list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, - ci_item)->writing; + list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, + ci_item)->writing; } /* inode.c */ @@ -838,12 +871,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); -extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, - u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); +extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, @@ -879,6 +912,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); /* addr.c */ extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); +extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; @@ -890,7 +926,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); -int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; @@ -911,6 +946,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); +extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index cd7ffad4041d..819163d8313b 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; int dirty = 0; @@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; int required_blob_size; + bool lock_snap_rwsem = false; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, if (!xattr) goto out; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + goto out; + spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, name_len, val_len); @@ -966,7 +984,7 @@ retry: dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); @@ -978,21 +996,28 @@ retry: flags, value ? 1 : -1, &xattr); if (!err) { - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; } spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); err = ceph_sync_setxattr(dentry, name, value, size, flags); out: + ceph_free_cap_flush(prealloc_cf); kfree(newname); kfree(newval); kfree(xattr); @@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; int required_blob_size; int dirty; + bool lock_snap_rwsem = false; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) goto do_sync_unlocked; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); + __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -1080,7 +1123,7 @@ retry: dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); @@ -1090,18 +1133,24 @@ retry: err = __remove_xattr_by_name(ceph_inode(inode), name); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); + ceph_free_cap_flush(prealloc_cf); err = ceph_send_removexattr(dentry, name); -out: return err; } diff --git a/fs/char_dev.c b/fs/char_dev.c index ea06a3d0364c..24b142569ca9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -274,7 +274,7 @@ out2: } /** - * unregister_chrdev_region() - return a range of device numbers + * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index a2172f3f69e3..e7b478b49985 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -192,6 +192,15 @@ config CIFS_SMB2 options are also slightly simpler (compared to CIFS) due to protocol improvements. +config CIFS_SMB311 + bool "SMB3.1.1 network file system support (Experimental)" + depends on CIFS_SMB2 && INET + + help + This enables experimental support for the newest, SMB3.1.1, dialect. + This dialect includes improved security negotiation features. + If unsure, say N + config CIFS_FSCACHE bool "Provide CIFS client caching support" depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h new file mode 100644 index 000000000000..0065256881d8 --- /dev/null +++ b/fs/cifs/cifs_ioctl.h @@ -0,0 +1,42 @@ +/* + * fs/cifs/cifs_ioctl.h + * + * Structure definitions for io control for cifs/smb3 + * + * Copyright (c) 2015 Steve French <steve.french@primarydata.com> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ + +struct smb_mnt_fs_info { + __u32 version; /* 0001 */ + __u16 protocol_id; + __u16 tcon_flags; + __u32 vol_serial_number; + __u32 vol_create_time; + __u32 share_caps; + __u32 share_flags; + __u32 sector_flags; + __u32 optimal_sector_size; + __u32 max_bytes_chunk; + __u32 fs_attributes; + __u32 max_path_component; + __u32 device_type; + __u32 device_characteristics; + __u32 maximal_access; + __u64 cifs_posix_caps; +} __packed; + +#define CIFS_IOCTL_MAGIC 0xCF +#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) +#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) +#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0a9fb6b53126..6a1119e87fbb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root) struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; - seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); + seq_show_option(s, "vers", tcon->ses->server->vals->version_string); cifs_show_security(s, tcon->ses); cifs_show_cache_flavor(s, cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_puts(s, ",multiuser"); else if (tcon->ses->user_name) - seq_printf(s, ",username=%s", tcon->ses->user_name); + seq_show_option(s, "username", tcon->ses->user_name); if (tcon->ses->domainName) - seq_printf(s, ",domain=%s", tcon->ses->domainName); + seq_show_option(s, "domain", tcon->ses->domainName); if (srcaddr->sa_family != AF_UNSPEC) { struct sockaddr_in *saddr4; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index a782b22904e4..27aea110e923 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.06" +#define CIFS_VERSION "2.07" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 22b289a3b1c4..b406a32deb1f 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -171,6 +171,10 @@ enum smb_version { Smb_21, Smb_30, Smb_302, +#ifdef CONFIG_CIFS_SMB311 + Smb_311, +#endif /* SMB311 */ + Smb_version_err }; struct mid_q_entry; @@ -368,6 +372,8 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *); int (*generate_signingkey)(struct cifs_ses *); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); + int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, + struct cifsFileInfo *src_file); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -386,6 +392,9 @@ struct smb_version_operations { int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, u64 src_off, u64 len, u64 dest_off); + int (*duplicate_extents)(const unsigned int, struct cifsFileInfo *src, + struct cifsFileInfo *target_file, u64 src_off, u64 len, + u64 dest_off); int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, const unsigned char *, const unsigned char *, char *, @@ -1617,4 +1626,8 @@ extern struct smb_version_values smb30_values; #define SMB302_VERSION_STRING "3.02" /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ extern struct smb_version_values smb302_values; +#define SMB311_VERSION_STRING "3.1.1" +#define ALT_SMB311_VERSION_STRING "3.11" +extern struct smb_version_operations smb311_operations; +extern struct smb_version_values smb311_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 5f9822ac0245..f5b87303ce46 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2245,6 +2245,20 @@ typedef struct { #define FILE_DEVICE_VIRTUAL_DISK 0x00000024 #define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028 +/* Device Characteristics */ +#define FILE_REMOVABLE_MEDIA 0x00000001 +#define FILE_READ_ONLY_DEVICE 0x00000002 +#define FILE_FLOPPY_DISKETTE 0x00000004 +#define FILE_WRITE_ONCE_MEDIA 0x00000008 +#define FILE_REMOTE_DEVICE 0x00000010 +#define FILE_DEVICE_IS_MOUNTED 0x00000020 +#define FILE_VIRTUAL_VOLUME 0x00000040 +#define FILE_DEVICE_SECURE_OPEN 0x00000100 +#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000 +#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000 +#define FILE_PORTABLE_DEVICE 0x00004000 +#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000 + typedef struct { __le32 DeviceType; __le32 DeviceCharacteristics; @@ -2255,6 +2269,8 @@ typedef struct { /* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ +#define FILE_SUPPORTS_SPARSE_VDL 0x10000000 /* faster nonsparse extend */ +#define FILE_SUPPORTS_BLOCK_REFCOUNTING 0x08000000 /* allow ioctl dup extents */ #define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 #define FILE_SUPPORTS_USN_JOURNAL 0x02000000 #define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 @@ -2310,6 +2326,16 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ char FileName[1]; } __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ +typedef struct { + __le64 AllocationSize; + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad; +} __attribute__((packed)) FILE_STANDARD_INFO; /* level 0x102 QPathInfo */ + + /* defines for enumerating possible values of the Unix type field below */ #define UNIX_FILE 0 #define UNIX_DIR 1 diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f26ffbfc64d8..90b4f9f7de66 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -625,9 +625,8 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->negflavor = CIFS_NEGFLAVOR_UNENCAP; memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, CIFS_CRYPTO_KEY_SIZE); - } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || - server->capabilities & CAP_EXTENDED_SECURITY) && - (pSMBr->EncryptionKeyLength == 0)) { + } else if (pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || + server->capabilities & CAP_EXTENDED_SECURITY) { server->negflavor = CIFS_NEGFLAVOR_EXTENDED; rc = decode_ext_sec_blob(ses, pSMBr); } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { @@ -697,7 +696,9 @@ cifs_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, 1, CIFS_ECHO_OP); } @@ -1573,7 +1574,9 @@ cifs_readv_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &rdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, 1, 0); } @@ -2033,6 +2036,7 @@ cifs_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; @@ -2069,7 +2073,9 @@ cifs_writev_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &wdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, 1, 0); } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8383d5ea4202..773f4dc77630 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -280,6 +280,11 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, { Smb_302, SMB302_VERSION_STRING }, +#ifdef CONFIG_CIFS_SMB311 + { Smb_311, SMB311_VERSION_STRING }, + { Smb_311, ALT_SMB311_VERSION_STRING }, +#endif /* SMB311 */ + { Smb_version_err, NULL } }; static int ip_connect(struct TCP_Server_Info *server); @@ -1133,6 +1138,12 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->ops = &smb30_operations; /* currently identical with 3.0 */ vol->vals = &smb302_values; break; +#ifdef CONFIG_CIFS_SMB311 + case Smb_311: + vol->ops = &smb311_operations; + vol->vals = &smb311_values; + break; +#endif /* SMB311 */ #endif default: cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); @@ -3461,6 +3472,8 @@ try_mount_again: else if (ses) cifs_put_smb_ses(ses); + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; + free_xid(xid); } #endif diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3f50cee79df9..e2a6af1508af 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_LOCKED; } -static struct vm_operations_struct cifs_file_vm_ops = { +static const struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 8b7898b7670f..c63f5227b681 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -31,12 +31,12 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" - -#define CIFS_IOCTL_MAGIC 0xCF -#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) +#include "cifs_ioctl.h" +#include <linux/btrfs.h> static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, - unsigned long srcfd, u64 off, u64 len, u64 destoff) + unsigned long srcfd, u64 off, u64 len, u64 destoff, + bool dup_extents) { int rc; struct cifsFileInfo *smb_file_target = dst_file->private_data; @@ -109,9 +109,14 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, truncate_inode_pages_range(&target_inode->i_data, destoff, PAGE_CACHE_ALIGN(destoff + len)-1); - if (target_tcon->ses->server->ops->clone_range) + if (dup_extents && target_tcon->ses->server->ops->duplicate_extents) + rc = target_tcon->ses->server->ops->duplicate_extents(xid, + smb_file_src, smb_file_target, off, len, destoff); + else if (!dup_extents && target_tcon->ses->server->ops->clone_range) rc = target_tcon->ses->server->ops->clone_range(xid, smb_file_src, smb_file_target, off, len, destoff); + else + rc = -EOPNOTSUPP; /* force revalidate of size and timestamps of target file now that target is updated on the server */ @@ -127,6 +132,43 @@ out_drop_write: return rc; } +static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, + void __user *arg) +{ + int rc = 0; + struct smb_mnt_fs_info *fsinf; + + fsinf = kzalloc(sizeof(struct smb_mnt_fs_info), GFP_KERNEL); + if (fsinf == NULL) + return -ENOMEM; + + fsinf->version = 1; + fsinf->protocol_id = tcon->ses->server->vals->protocol_id; + fsinf->device_characteristics = + le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics); + fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); + fsinf->fs_attributes = le32_to_cpu(tcon->fsAttrInfo.Attributes); + fsinf->max_path_component = + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength); +#ifdef CONFIG_CIFS_SMB2 + fsinf->vol_serial_number = tcon->vol_serial_number; + fsinf->vol_create_time = le64_to_cpu(tcon->vol_create_time); + fsinf->share_flags = tcon->share_flags; + fsinf->share_caps = le32_to_cpu(tcon->capabilities); + fsinf->sector_flags = tcon->ss_flags; + fsinf->optimal_sector_size = tcon->perf_sector_size; + fsinf->max_bytes_chunk = tcon->max_bytes_chunk; + fsinf->maximal_access = tcon->maximal_access; +#endif /* SMB2 */ + fsinf->cifs_posix_caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + + if (copy_to_user(arg, fsinf, sizeof(struct smb_mnt_fs_info))) + rc = -EFAULT; + + kfree(fsinf); + return rc; +} + long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); @@ -140,8 +182,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); - cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg); - cifs_sb = CIFS_SB(inode->i_sb); switch (command) { @@ -205,7 +245,24 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) } break; case CIFS_IOC_COPYCHUNK_FILE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0); + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false); + break; + case BTRFS_IOC_CLONE: + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true); + break; + case CIFS_IOC_SET_INTEGRITY: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + if (tcon->ses->server->ops->set_integrity) + rc = tcon->ses->server->ops->set_integrity(xid, + tcon, pSMBFile); + else + rc = -EOPNOTSUPP; + break; + case CIFS_IOC_GET_MNT_INFO: + tcon = tlink_tcon(pSMBFile->tlink); + rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; default: cifs_dbg(FYI, "unsupported ioctl\n"); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 54daee5ad4c1..df91bcf56d67 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -806,6 +806,53 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, &eof, false); } +#ifdef CONFIG_CIFS_SMB311 +static int +smb2_duplicate_extents(const unsigned int xid, + struct cifsFileInfo *srcfile, + struct cifsFileInfo *trgtfile, u64 src_off, + u64 len, u64 dest_off) +{ + int rc; + unsigned int ret_data_len; + char *retbuf = NULL; + struct duplicate_extents_to_file dup_ext_buf; + struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); + + /* server fileays advertise duplicate extent support with this flag */ + if ((le32_to_cpu(tcon->fsAttrInfo.Attributes) & + FILE_SUPPORTS_BLOCK_REFCOUNTING) == 0) + return -EOPNOTSUPP; + + dup_ext_buf.VolatileFileHandle = srcfile->fid.volatile_fid; + dup_ext_buf.PersistentFileHandle = srcfile->fid.persistent_fid; + dup_ext_buf.SourceFileOffset = cpu_to_le64(src_off); + dup_ext_buf.TargetFileOffset = cpu_to_le64(dest_off); + dup_ext_buf.ByteCount = cpu_to_le64(len); + cifs_dbg(FYI, "duplicate extents: src off %lld dst off %lld len %lld", + src_off, dest_off, len); + + rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); + if (rc) + goto duplicate_extents_out; + + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, + FSCTL_DUPLICATE_EXTENTS_TO_FILE, + true /* is_fsctl */, (char *)&dup_ext_buf, + sizeof(struct duplicate_extents_to_file), + (char **)&retbuf, + &ret_data_len); + + if (ret_data_len > 0) + cifs_dbg(FYI, "non-zero response length in duplicate extents"); + +duplicate_extents_out: + return rc; +} +#endif /* CONFIG_CIFS_SMB311 */ + + static int smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) @@ -815,6 +862,28 @@ smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, } static int +smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + struct fsctl_set_integrity_information_req integr_info; + char *retbuf = NULL; + unsigned int ret_data_len; + + integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); + integr_info.Flags = 0; + integr_info.Reserved = 0; + + return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SET_INTEGRITY_INFORMATION, + true /* is_fsctl */, (char *)&integr_info, + sizeof(struct fsctl_set_integrity_information_req), + (char **)&retbuf, + &ret_data_len); + +} + +static int smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cifs_fid *fid, __u16 search_flags, @@ -1624,6 +1693,7 @@ struct smb_version_operations smb30_operations = { .new_lease_key = smb2_new_lease_key, .generate_signingkey = generate_smb3signingkey, .calc_signature = smb3_calc_signature, + .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, .set_oplock_level = smb3_set_oplock_level, .create_lease_buf = smb3_create_lease_buf, @@ -1635,6 +1705,94 @@ struct smb_version_operations smb30_operations = { .fallocate = smb3_fallocate, }; +#ifdef CONFIG_CIFS_SMB311 +struct smb_version_operations smb311_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .dump_share_caps = smb2_dump_share_caps, + .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb3_qfs_tcon, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .generate_signingkey = generate_smb3signingkey, + .calc_signature = smb3_calc_signature, + .set_integrity = smb3_set_integrity, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb3_set_oplock_level, + .create_lease_buf = smb3_create_lease_buf, + .parse_lease_buf = smb3_parse_lease_buf, + .clone_range = smb2_clone_range, + .duplicate_extents = smb2_duplicate_extents, +/* .validate_negotiate = smb3_validate_negotiate, */ /* not used in 3.11 */ + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, + .fallocate = smb3_fallocate, +}; +#endif /* CIFS_SMB311 */ + struct smb_version_values smb20_values = { .version_string = SMB20_VERSION_STRING, .protocol_id = SMB20_PROT_ID, @@ -1714,3 +1872,25 @@ struct smb_version_values smb302_values = { .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, .create_lease_size = sizeof(struct create_lease_v2), }; + +#ifdef CONFIG_CIFS_SMB311 +struct smb_version_values smb311_values = { + .version_string = SMB311_VERSION_STRING, + .protocol_id = SMB311_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; +#endif /* SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 54cbe19d9c08..070fb2ad85ce 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -304,6 +304,59 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return rc; } +#ifdef CONFIG_CIFS_SMB311 +/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */ +#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) - 4 */ + + +#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) +#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) + +static void +build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(38); + pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1); + pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE); + get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE); + pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512; +} + +static void +build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(6); + pneg_ctxt->CipherCount = cpu_to_le16(2); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; +} + +static void +assemble_neg_contexts(struct smb2_negotiate_req *req) +{ + + /* +4 is to account for the RFC1001 len field */ + char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4; + + build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); + /* Add 2 to size to round to 8 byte boundary */ + pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context); + build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); + req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); + req->NegotiateContextCount = cpu_to_le16(2); + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ +} +#else +static void assemble_neg_contexts(struct smb2_negotiate_req *req) +{ + return; +} +#endif /* SMB311 */ + + /* * * SMB2 Worker functions follow: @@ -363,10 +416,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* ClientGUID must be zero for SMB2.02 dialect */ if (ses->server->vals->protocol_id == SMB20_PROT_ID) memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE); - else + else { memcpy(req->ClientGUID, server->client_guid, SMB2_CLIENT_GUID_SIZE); - + if (ses->server->vals->protocol_id == SMB311_PROT_ID) + assemble_neg_contexts(req); + } iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; @@ -393,8 +448,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID)) cifs_dbg(FYI, "negotiated smb3.02 dialect\n"); +#ifdef CONFIG_CIFS_SMB311 + else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) + cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n"); +#endif /* SMB311 */ else { - cifs_dbg(VFS, "Illegal dialect returned by server %d\n", + cifs_dbg(VFS, "Illegal dialect returned by server 0x%x\n", le16_to_cpu(rsp->DialectRevision)); rc = -EIO; goto neg_exit; @@ -572,7 +631,7 @@ ssetup_ntlmssp_authenticate: return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ - req->VcNumber = 0; /* MBZ */ + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); @@ -1567,7 +1626,9 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED) credits_received = le16_to_cpu(smb2->hdr.CreditRequest); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, CIFS_ECHO_OP); } @@ -1751,7 +1812,9 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_READ_HE); queue_work(cifsiod_wq, &rdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, 0); } @@ -1879,6 +1942,7 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; unsigned int credits_received = 1; @@ -1918,7 +1982,9 @@ smb2_writev_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); queue_work(cifsiod_wq, &wdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, credits_received, 0); } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 70867d54fb8b..451108284a2f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -136,9 +136,6 @@ struct smb2_transform_hdr { __u64 SessionId; } __packed; -/* Encryption Algorithms */ -#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) - /* * SMB2 flag definitions */ @@ -191,7 +188,10 @@ struct smb2_negotiate_req { __le16 Reserved; /* MBZ */ __le32 Capabilities; __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; - __le64 ClientStartTime; /* MBZ */ + /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ + __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ + __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */ + __le16 Reserved2; __le16 Dialects[1]; /* One dialect (vers=) at a time for now */ } __packed; @@ -200,6 +200,7 @@ struct smb2_negotiate_req { #define SMB21_PROT_ID 0x0210 #define SMB30_PROT_ID 0x0300 #define SMB302_PROT_ID 0x0302 +#define SMB311_PROT_ID 0x0311 #define BAD_PROT_ID 0xFFFF /* SecurityMode flags */ @@ -217,12 +218,38 @@ struct smb2_negotiate_req { #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 +#define SMB311_SALT_SIZE 32 +/* Hash Algorithm Types */ +#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) + +struct smb2_preauth_neg_context { + __le16 ContextType; /* 1 */ + __le16 DataLength; + __le32 Reserved; + __le16 HashAlgorithmCount; /* 1 */ + __le16 SaltLength; + __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ + __u8 Salt[SMB311_SALT_SIZE]; +} __packed; + +/* Encryption Algorithms Ciphers */ +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) + +struct smb2_encryption_neg_context { + __le16 ContextType; /* 2 */ + __le16 DataLength; + __le32 Reserved; + __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ + __le16 Ciphers[2]; /* Ciphers[0] since only one used now */ +} __packed; + struct smb2_negotiate_rsp { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 65 */ __le16 SecurityMode; __le16 DialectRevision; - __le16 Reserved; /* MBZ */ + __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ __u8 ServerGUID[16]; __le32 Capabilities; __le32 MaxTransactSize; @@ -232,14 +259,18 @@ struct smb2_negotiate_rsp { __le64 ServerStartTime; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le32 Reserved2; /* may be any value, ignore */ + __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; +/* Flags */ +#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 +#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 + struct smb2_sess_setup_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 25 */ - __u8 VcNumber; + __u8 Flags; __u8 SecurityMode; __le32 Capabilities; __le32 Channel; @@ -274,10 +305,13 @@ struct smb2_logoff_rsp { __le16 Reserved; } __packed; +/* Flags/Reserved for SMB3.1.1 */ +#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001 + struct smb2_tree_connect_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ - __le16 Reserved; + __le16 Reserved; /* Flags in SMB3.1.1 */ __le16 PathOffset; __le16 PathLength; __u8 Buffer[1]; /* variable length */ @@ -587,6 +621,29 @@ struct copychunk_ioctl_rsp { __le32 TotalBytesWritten; } __packed; +struct fsctl_set_integrity_information_req { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; +} __packed; + +struct fsctl_get_integrity_information_rsp { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; + __le32 ChecksumChunkSizeInBytes; + __le32 ClusterSizeInBytes; +} __packed; + +/* Integrity ChecksumAlgorithm choices for above */ +#define CHECKSUM_TYPE_NONE 0x0000 +#define CHECKSUM_TYPE_CRC64 0x0002 +#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */ + +/* Integrity flags for above */ +#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 + + struct validate_negotiate_info_req { __le32 Capabilities; __u8 Guid[SMB2_CLIENT_GUID_SIZE]; @@ -620,6 +677,14 @@ struct compress_ioctl { __le16 CompressionState; /* See cifspdu.h for possible flag values */ } __packed; +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + struct smb2_ioctl_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 83efa59535be..a639d0dab453 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -75,10 +75,13 @@ #define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ #define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ +#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ +#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344 #define FSCTL_SIS_LINK_FILES 0x0009C104 +#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280 #define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ #define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ /* strange that the number for this op is not sequential with previous op */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 126f46b887cc..2a24c524fb9a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -644,7 +644,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); return rc; } diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d6f7a76a1f5b..f829fe963f5b 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -79,7 +79,7 @@ void coda_sysctl_clean(void); static inline struct coda_inode_info *ITOC(struct inode *inode) { - return list_entry(inode, struct coda_inode_info, vfs_inode); + return container_of(inode, struct coda_inode_info, vfs_inode); } static __inline__ struct CodaFid *coda_i2f(struct inode *inode) diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 9b1ffaa0572e..f6c6c8adbc01 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid, char *result; insize = max_t(unsigned int, - INSIZE(readlink), OUTSIZE(readlink)+ *length + 1); + INSIZE(readlink), OUTSIZE(readlink)+ *length); UPARG(CODA_READLINK); inp->coda_readlink.VFid = *fid; @@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); if (!error) { retlen = outp->coda_readlink.count; - if ( retlen > *length ) - retlen = *length; + if (retlen >= *length) + retlen = *length - 1; *length = retlen; result = (char *)outp + (long)outp->coda_readlink.data; memcpy(buffer, result, retlen); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6b8e2f091f5b..48851f6ea6ec 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -896,6 +896,7 @@ COMPATIBLE_IOCTL(FIGETBSZ) /* 'X' - originally XFS but some now in the VFS */ COMPATIBLE_IOCTL(FIFREEZE) COMPATIBLE_IOCTL(FITHAW) +COMPATIBLE_IOCTL(FITRIM) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) COMPATIBLE_IOCTL(KDGKBTYPE) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 8d89f5fd0331..eae87575e681 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -236,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!d_unhashed(dentry) && d_really_is_positive(dentry)) { + if (simple_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 4d6a30e76168..b863a09cd2f1 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -115,7 +115,7 @@ void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type) { - config_item_set_name(item, name); + config_item_set_name(item, "%s", name); item->ci_type = type; config_item_init(item); } @@ -124,7 +124,7 @@ EXPORT_SYMBOL(config_item_init_type_name); void config_group_init_type_name(struct config_group *group, const char *name, struct config_item_type *type) { - config_item_set_name(&group->cg_item, name); + config_item_set_name(&group->cg_item, "%s", name); group->cg_item.ci_type = type; config_group_init(group); } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 537356742091..a8f3b589a2df 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -129,8 +129,6 @@ void configfs_release_fs(void) } -static struct kobject *config_kobj; - static int __init configfs_init(void) { int err = -ENOMEM; @@ -141,8 +139,8 @@ static int __init configfs_init(void) if (!configfs_dir_cachep) goto out; - config_kobj = kobject_create_and_add("config", kernel_kobj); - if (!config_kobj) + err = sysfs_create_mount_point(kernel_kobj, "config"); + if (err) goto out2; err = register_filesystem(&configfs_fs_type); @@ -152,7 +150,7 @@ static int __init configfs_init(void) return 0; out3: pr_err("Unable to register filesystem!\n"); - kobject_put(config_kobj); + sysfs_remove_mount_point(kernel_kobj, "config"); out2: kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; @@ -163,7 +161,7 @@ out: static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); - kobject_put(config_kobj); + sysfs_remove_mount_point(kernel_kobj, "config"); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; } diff --git a/fs/coredump.c b/fs/coredump.c index e52e0064feac..a8f75640ac86 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -140,7 +140,7 @@ static int cn_print_exe_file(struct core_name *cn) goto put_exe_file; } - path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + path = file_path(exe_file, pathbuf, PATH_MAX); if (IS_ERR(path)) { ret = PTR_ERR(path); goto free_buf; @@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo) const struct cred *old_cred; struct cred *cred; int retval = 0; - int flag = 0; int ispipe; struct files_struct *displaced; - bool need_nonrelative = false; + /* require nonrelative corefile path and be extra careful */ + bool need_suid_safe = false; bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { @@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo) */ if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { /* Setuid core dump mode */ - flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ - need_nonrelative = true; + need_suid_safe = true; } retval = coredump_wait(siginfo->si_signo, &core_state); @@ -633,7 +632,7 @@ void do_coredump(const siginfo_t *siginfo) if (cprm.limit < binfmt->min_coredump) goto fail_unlock; - if (need_nonrelative && cn.corename[0] != '/') { + if (need_suid_safe && cn.corename[0] != '/') { printk(KERN_WARNING "Pid %d(%s) can only dump core "\ "to fully qualified path!\n", task_tgid_vnr(current), current->comm); @@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo) goto fail_unlock; } + /* + * Unlink the file if it exists unless this is a SUID + * binary - in that case, we're running around with root + * privs and don't want to unlink another user's coredump. + */ + if (!need_suid_safe) { + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + /* + * If it doesn't exist, that's fine. If there's some + * other problem, we'll catch it at the filp_open(). + */ + (void) sys_unlink((const char __user *)cn.corename); + set_fs(old_fs); + } + + /* + * There is a race between unlinking and creating the + * file, but if that causes an EEXIST here, that's + * fine - another process raced with us while creating + * the corefile, and the other process won. To userspace, + * what matters is that at least one of the two processes + * writes its coredump successfully, not which one. + */ cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, + O_CREAT | 2 | O_NOFOLLOW | + O_LARGEFILE | O_EXCL, 0600); if (IS_ERR(cprm.file)) goto fail_unlock; @@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo) if (!S_ISREG(inode->i_mode)) goto close_fail; /* - * Dont allow local users get cute and trick others to coredump - * into their pre-created files. + * Don't dump core if the filesystem changed owner or mode + * of the file during file creation. This is an issue when + * a process dumps core while its cwd is e.g. on a vfat + * filesystem. */ if (!uid_eq(inode->i_uid, current_fsuid())) goto close_fail; + if ((inode->i_mode & 0677) != 0600) + goto close_fail; if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) @@ -17,12 +17,14 @@ #include <linux/atomic.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/genhd.h> #include <linux/highmem.h> #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/pmem.h> #include <linux/sched.h> #include <linux/uio.h> #include <linux/vmstat.h> @@ -34,7 +36,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) might_sleep(); do { - void *addr; + void __pmem *addr; unsigned long pfn; long count; @@ -46,10 +48,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) unsigned pgsz = PAGE_SIZE - offset_in_page(addr); if (pgsz > count) pgsz = count; - if (pgsz < PAGE_SIZE) - memset(addr, 0, pgsz); - else - clear_page(addr); + clear_pmem(addr, pgsz); addr += pgsz; size -= pgsz; count -= pgsz; @@ -59,26 +58,29 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) } } while (size); + wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_clear_blocks); -static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, + unsigned blkbits) { unsigned long pfn; sector_t sector = bh->b_blocknr << (blkbits - 9); return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); } -static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, - loff_t end) +/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ +static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, + loff_t pos, loff_t end) { loff_t final = end - pos + first; /* The final byte of the buffer */ if (first > 0) - memset(addr, 0, first); + clear_pmem(addr, first); if (final < size) - memset(addr + final, 0, size - final); + clear_pmem(addr + final, size - final); } static bool buffer_written(struct buffer_head *bh) @@ -106,14 +108,15 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t pos = start; loff_t max = start; loff_t bh_max = start; - void *addr; + void __pmem *addr; bool hole = false; + bool need_wmb = false; if (iov_iter_rw(iter) != WRITE) end = min(end, i_size_read(inode)); while (pos < end) { - unsigned len; + size_t len; if (pos == max) { unsigned blkbits = inode->i_blkbits; sector_t block = pos >> blkbits; @@ -145,19 +148,23 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, retval = dax_get_addr(bh, &addr, blkbits); if (retval < 0) break; - if (buffer_unwritten(bh) || buffer_new(bh)) + if (buffer_unwritten(bh) || buffer_new(bh)) { dax_new_buf(addr, retval, first, pos, end); + need_wmb = true; + } addr += first; size = retval - first; } max = min(pos + size, end); } - if (iov_iter_rw(iter) == WRITE) - len = copy_from_iter(addr, max - pos, iter); - else if (!hole) - len = copy_to_iter(addr, max - pos, iter); + if (iov_iter_rw(iter) == WRITE) { + len = copy_from_iter_pmem(addr, max - pos, iter); + need_wmb = true; + } else if (!hole) + len = copy_to_iter((void __force *)addr, max - pos, + iter); else len = iov_iter_zero(max - pos, iter); @@ -168,6 +175,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, addr += len; } + if (need_wmb) + wmb_pmem(); + return (pos == start) ? retval : pos - start; } @@ -209,7 +219,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, } /* Protects against truncate */ - inode_dio_begin(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); retval = dax_io(inode, iter, pos, end, get_block, &bh); @@ -219,7 +230,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); - inode_dio_end(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(inode); out: return retval; } @@ -258,11 +270,13 @@ static int dax_load_hole(struct address_space *mapping, struct page *page, static int copy_user_bh(struct page *to, struct buffer_head *bh, unsigned blkbits, unsigned long vaddr) { - void *vfrom, *vto; + void __pmem *vfrom; + void *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) return -EIO; vto = kmap_atomic(to); - copy_user_page(vto, vfrom, vaddr, to); + copy_user_page(vto, (void __force *)vfrom, vaddr, to); kunmap_atomic(vto); return 0; } @@ -270,16 +284,13 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh, static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct address_space *mapping = inode->i_mapping; sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; - void *addr; + void __pmem *addr; unsigned long pfn; pgoff_t size; int error; - i_mmap_lock_read(mapping); - /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the @@ -301,22 +312,35 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, goto out; } - if (buffer_unwritten(bh) || buffer_new(bh)) - clear_page(addr); + if (buffer_unwritten(bh) || buffer_new(bh)) { + clear_pmem(addr, PAGE_SIZE); + wmb_pmem(); + } error = vm_insert_mixed(vma, vaddr, pfn); out: - i_mmap_unlock_read(mapping); - - if (bh->b_end_io) - bh->b_end_io(bh, 1); - return error; } -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) +/** + * __dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * @complete_unwritten: The filesystem method used to convert unwritten blocks + * to written so the data written to them is exposed. This is required for + * required by write faults for filesystems that will return unwritten + * extent mappings from @get_block, but it is optional for reads as + * dax_insert_mapping() will always zero unwritten blocks. If the fs does + * not support unwritten extents, the it should pass NULL. + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. __dax_fault() assumes the caller has done all + * the necessary locking for the page fault to proceed successfully. + */ +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -357,15 +381,17 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * from a read fault and we've raced with a truncate */ error = -EIO; - goto unlock_page; + goto unlock; } + } else { + i_mmap_lock_write(mapping); } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) - goto unlock_page; + goto unlock; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { @@ -376,8 +402,9 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) - goto unlock_page; + goto unlock; } else { + i_mmap_unlock_write(mapping); return dax_load_hole(mapping, page, vmf); } } @@ -389,17 +416,15 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; + goto unlock; vmf->page = page; if (!page) { - i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); error = -EIO; - goto out; + goto unlock; } } return VM_FAULT_LOCKED; @@ -417,8 +442,26 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page_cache_release(page); } + /* + * If we successfully insert the new mapping over an unwritten extent, + * we need to ensure we convert the unwritten extent. If there is an + * error inserting the mapping, the filesystem needs to leave it as + * unwritten to prevent exposure of the stale underlying data to + * userspace, but we still need to call the completion function so + * the private resources on the mapping buffer can be released. We + * indicate what the callback should do via the uptodate variable, same + * as for normal BH based IO completions. + */ error = dax_insert_mapping(inode, &bh, vma, vmf); + if (buffer_unwritten(&bh)) { + if (complete_unwritten) + complete_unwritten(&bh, !error); + else + WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + } + if (!page) + i_mmap_unlock_write(mapping); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -427,13 +470,17 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - unlock_page: + unlock: if (page) { unlock_page(page); page_cache_release(page); + } else { + i_mmap_unlock_write(mapping); } + goto out; } +EXPORT_SYMBOL(__dax_fault); /** * dax_fault - handle a page fault on a DAX file @@ -445,7 +492,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) + get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -454,7 +501,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = do_dax_fault(vma, vmf, get_block); + result = __dax_fault(vma, vmf, get_block, complete_unwritten); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -462,6 +509,177 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below function. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + +int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block, + dax_iodone_t complete_unwritten) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct buffer_head bh; + unsigned blkbits = inode->i_blkbits; + unsigned long pmd_addr = address & PMD_MASK; + bool write = flags & FAULT_FLAG_WRITE; + long length; + void __pmem *kaddr; + pgoff_t size, pgoff; + sector_t block, sector; + unsigned long pfn; + int result = 0; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vma->vm_flags & VM_SHARED)) + return VM_FAULT_FALLBACK; + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vma->vm_start) + return VM_FAULT_FALLBACK; + if ((pmd_addr + PMD_SIZE) > vma->vm_end) + return VM_FAULT_FALLBACK; + + pgoff = linear_page_index(vma, pmd_addr); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) + return VM_FAULT_SIGBUS; + /* If the PMD would cover blocks out of the file */ + if ((pgoff | PG_PMD_COLOUR) >= size) + return VM_FAULT_FALLBACK; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); + + bh.b_size = PMD_SIZE; + i_mmap_lock_write(mapping); + length = get_block(inode, block, &bh, write); + if (length) + return VM_FAULT_SIGBUS; + + /* + * If the filesystem isn't willing to tell us the length of a hole, + * just fall back to PTEs. Calling get_block 512 times in a loop + * would be silly. + */ + if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) + goto fallback; + + if (buffer_unwritten(&bh) || buffer_new(&bh)) { + int i; + for (i = 0; i < PTRS_PER_PMD; i++) + clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); + wmb_pmem(); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + result |= VM_FAULT_MAJOR; + } + + /* + * If we allocated new storage, make sure no process has any + * zero pages covering this hole + */ + if (buffer_new(&bh)) { + i_mmap_unlock_write(mapping); + unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); + i_mmap_lock_write(mapping); + } + + /* + * If a truncate happened while we were allocating blocks, we may + * leave blocks allocated to the file that are beyond EOF. We can't + * take i_mutex here, so just leave them hanging; they'll be freed + * when the file is deleted. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((pgoff | PG_PMD_COLOUR) >= size) + goto fallback; + + if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + spinlock_t *ptl; + pmd_t entry; + struct page *zero_page = get_huge_zero_page(); + + if (unlikely(!zero_page)) + goto fallback; + + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_none(*pmd)) { + spin_unlock(ptl); + goto fallback; + } + + entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = pmd_mkhuge(entry); + set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); + result = VM_FAULT_NOPAGE; + spin_unlock(ptl); + } else { + sector = bh.b_blocknr << (blkbits - 9); + length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, + bh.b_size); + if (length < 0) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) + goto fallback; + + result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); + } + + out: + if (buffer_unwritten(&bh)) + complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); + + i_mmap_unlock_write(mapping); + + return result; + + fallback: + count_vm_event(THP_FAULT_FALLBACK); + result = VM_FAULT_FALLBACK; + goto out; +} +EXPORT_SYMBOL_GPL(__dax_pmd_fault); + +/** + * dax_pmd_fault - handle a PMD fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * pmd_fault handler for DAX files. + */ +int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block, + dax_iodone_t complete_unwritten) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = __dax_pmd_fault(vma, address, pmd, flags, get_block, + complete_unwritten); + if (flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_pmd_fault); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /** * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred @@ -516,11 +734,12 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, if (err < 0) return err; if (buffer_written(&bh)) { - void *addr; + void __pmem *addr; err = dax_get_addr(&bh, &addr, inode->i_blkbits); if (err < 0) return err; - memset(addr + offset, 0, length); + clear_pmem(addr + offset, length); + wmb_pmem(); } return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 592c4b582495..5c33aeb0f68f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -642,7 +642,7 @@ static inline bool fast_dput(struct dentry *dentry) /* * If we have a d_op->d_delete() operation, we sould not - * let the dentry count go to zero, so use "put__or_lock". + * let the dentry count go to zero, so use "put_or_lock". */ if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) return lockref_put_or_lock(&dentry->d_lockref); @@ -697,7 +697,7 @@ static inline bool fast_dput(struct dentry *dentry) */ smp_rmb(); d_flags = ACCESS_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST; + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) @@ -776,6 +776,9 @@ repeat: if (unlikely(d_unhashed(dentry))) goto kill_it; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + goto kill_it; + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { if (dentry->d_op->d_delete(dentry)) goto kill_it; @@ -1673,7 +1676,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_DELETE )); + DCACHE_OP_DELETE | + DCACHE_OP_SELECT_INODE)); dentry->d_op = op; if (!op) return; @@ -1689,6 +1693,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; + if (op->d_select_inode) + dentry->d_flags |= DCACHE_OP_SELECT_INODE; } EXPORT_SYMBOL(d_set_d_op); @@ -2712,7 +2718,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... @@ -2738,7 +2744,6 @@ out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: - spin_unlock(&inode->i_lock); if (m2) mutex_unlock(m2); if (m1) @@ -2784,10 +2789,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { + /* The reference to new ensures it remains an alias */ + spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( @@ -2806,7 +2812,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); } iput(inode); @@ -2920,6 +2925,13 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + /* Escaped? */ + if (dentry != vfsmnt->mnt_root) { + bptr = *buffer; + blen = *buflen; + error = 3; + break; + } /* Global root? */ if (mnt != parent) { dentry = ACCESS_ONCE(mnt->mnt_mountpoint); @@ -2927,17 +2939,6 @@ restart: vfsmnt = &mnt->mnt; continue; } - /* - * Filesystems needing to implement special "root names" - * should do so with ->d_dname() - */ - if (IS_ROOT(dentry) && - (dentry->d_name.len != 1 || - dentry->d_name.name[0] != '/')) { - WARN(1, "Root dentry has weird name <%.*s>\n", - (int) dentry->d_name.len, - dentry->d_name.name); - } if (!error) error = is_mounted(vfsmnt) ? 1 : 2; break; @@ -3447,22 +3448,15 @@ void __init vfs_caches_init_early(void) inode_init_early(); } -void __init vfs_caches_init(unsigned long mempages) +void __init vfs_caches_init(void) { - unsigned long reserve; - - /* Base hash sizes on available memory, with a reserve equal to - 150% of current kernel size */ - - reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); - mempages -= reserve; - names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dcache_init(); inode_init(); - files_init(mempages); + files_init(); + files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 284f9aa0028b..6c55ade071c3 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); -static ssize_t read_file_bool(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[3]; u32 *val = file->private_data; @@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } +EXPORT_SYMBOL_GPL(debugfs_read_file_bool); -static ssize_t write_file_bool(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; @@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, return count; } +EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { - .read = read_file_bool, - .write = write_file_bool, + .read = debugfs_read_file_bool, + .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7eaec88ea970..c711be8d6a3c 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb) return inode; } -static inline int debugfs_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - struct debugfs_mount_opts { kuid_t uid; kgid_t gid; @@ -522,7 +517,7 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (debugfs_positive(dentry)) { + if (simple_positive(dentry)) { dget(dentry); if (d_is_dir(dentry)) ret = simple_rmdir(d_inode(parent), dentry); @@ -602,7 +597,7 @@ void debugfs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!debugfs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ @@ -623,7 +618,7 @@ void debugfs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * debugfs_positive() check. + * simple_positive() check. */ goto loop; } @@ -716,20 +711,17 @@ bool debugfs_initialized(void) } EXPORT_SYMBOL_GPL(debugfs_initialized); - -static struct kobject *debug_kobj; - static int __init debugfs_init(void) { int retval; - debug_kobj = kobject_create_and_add("debug", kernel_kobj); - if (!debug_kobj) - return -EINVAL; + retval = sysfs_create_mount_point(kernel_kobj, "debug"); + if (retval) + return retval; retval = register_filesystem(&debug_fs_type); if (retval) - kobject_put(debug_kobj); + sysfs_remove_mount_point(kernel_kobj, "debug"); else debugfs_registered = true; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index add566303c68..c35ffdc12bba 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) return inode->i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt->mnt_sb; } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb->s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb->s_root; + fsi = DEVPTS_SB(sb); + opts = &fsi->mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -676,12 +689,16 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + struct vfsmount *mnt; + table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(&devpts_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; diff --git a/fs/direct-io.c b/fs/direct-io.c index 745d2342651a..11256291642e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -285,7 +285,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. */ -static void dio_bio_end_aio(struct bio *bio, int error) +static void dio_bio_end_aio(struct bio *bio) { struct dio *dio = bio->bi_private; unsigned long remaining; @@ -318,7 +318,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) * During I/O bi_private points at the dio. After I/O, bi_private is used to * implement a singly-linked list of completed BIOs, at dio->bio_list. */ -static void dio_bio_end_io(struct bio *bio, int error) +static void dio_bio_end_io(struct bio *bio) { struct dio *dio = bio->bi_private; unsigned long flags; @@ -345,9 +345,9 @@ void dio_end_io(struct bio *bio, int error) struct dio *dio = bio->bi_private; if (dio->is_async) - dio_bio_end_aio(bio, error); + dio_bio_end_aio(bio); else - dio_bio_end_io(bio, error); + dio_bio_end_io(bio); } EXPORT_SYMBOL_GPL(dio_end_io); @@ -457,15 +457,16 @@ static struct bio *dio_await_one(struct dio *dio) */ static int dio_bio_complete(struct dio *dio, struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec; unsigned i; + int err; - if (!uptodate) + if (bio->bi_error) dio->io_error = -EIO; if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ + err = bio->bi_error; } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; @@ -474,9 +475,10 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) set_page_dirty_lock(page); page_cache_release(page); } + err = bio->bi_error; bio_put(bio); } - return uptodate ? 0 : -EIO; + return err; } /* @@ -653,7 +655,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, if (ret) goto out; sector = start_sector << (sdio->blkbits - 9); - nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); + nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 754fd6c0b747..87e9d796cf7d 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -120,12 +120,11 @@ struct connection { struct cbuf cb; int retries; #define MAX_CONNECT_RETRIES 3 - int sctp_assoc; struct hlist_node list; struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ - bool try_new_addr; + void (*orig_error_report)(struct sock *sk); }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -252,26 +251,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) return con; } -/* This is a bit drastic, but only called when things go wrong */ -static struct connection *assoc2con(int assoc_id) -{ - int i; - struct connection *con; - - mutex_lock(&connections_lock); - - for (i = 0 ; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry(con, &connection_hash[i], list) { - if (con->sctp_assoc == assoc_id) { - mutex_unlock(&connections_lock); - return con; - } - } - } - mutex_unlock(&connections_lock); - return NULL; -} - static struct dlm_node_addr *find_node_addr(int nodeid) { struct dlm_node_addr *na; @@ -322,14 +301,14 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, spin_lock(&dlm_node_addrs_spin); na = find_node_addr(nodeid); if (na && na->addr_count) { + memcpy(&sas, na->addr[na->curr_addr_index], + sizeof(struct sockaddr_storage)); + if (try_new_addr) { na->curr_addr_index++; if (na->curr_addr_index == na->addr_count) na->curr_addr_index = 0; } - - memcpy(&sas, na->addr[na->curr_addr_index ], - sizeof(struct sockaddr_storage)); } spin_unlock(&dlm_node_addrs_spin); @@ -459,18 +438,23 @@ static inline void lowcomms_connect_sock(struct connection *con) static void lowcomms_state_change(struct sock *sk) { - if (sk->sk_state == TCP_ESTABLISHED) + /* SCTP layer is not calling sk_data_ready when the connection + * is done, so we catch the signal through here. Also, it + * doesn't switch socket state when entering shutdown, so we + * skip the write in that case. + */ + if (sk->sk_shutdown) { + if (sk->sk_shutdown == RCV_SHUTDOWN) + lowcomms_data_ready(sk); + } else if (sk->sk_state == TCP_ESTABLISHED) { lowcomms_write_space(sk); + } } int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; - /* with sctp there's no connecting without sending */ - if (dlm_config.ci_protocol != 0) - return 0; - if (nodeid == dlm_our_nodeid()) return 0; @@ -481,6 +465,43 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; } +static void lowcomms_error_report(struct sock *sk) +{ + struct connection *con = sock2con(sk); + struct sockaddr_storage saddr; + + if (nodeid_to_addr(con->nodeid, &saddr, NULL, false)) { + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "sending to node %d, port %d, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, dlm_config.ci_tcp_port, + sk->sk_err, sk->sk_err_soft); + return; + } else if (saddr.ss_family == AF_INET) { + struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; + + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "sending to node %d at %pI4, port %d, " + "sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, &sin4->sin_addr.s_addr, + dlm_config.ci_tcp_port, sk->sk_err, + sk->sk_err_soft); + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&saddr; + + printk_ratelimited(KERN_ERR "dlm: node %d: socket error " + "sending to node %d at %u.%u.%u.%u, " + "port %d, sk_err=%d/%d\n", dlm_our_nodeid(), + con->nodeid, sin6->sin6_addr.s6_addr32[0], + sin6->sin6_addr.s6_addr32[1], + sin6->sin6_addr.s6_addr32[2], + sin6->sin6_addr.s6_addr32[3], + dlm_config.ci_tcp_port, sk->sk_err, + sk->sk_err_soft); + } + con->orig_error_report(sk); +} + /* Make a socket active */ static void add_sock(struct socket *sock, struct connection *con) { @@ -492,6 +513,8 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock->sk->sk_state_change = lowcomms_state_change; con->sock->sk->sk_user_data = con; con->sock->sk->sk_allocation = GFP_NOFS; + con->orig_error_report = con->sock->sk->sk_error_report; + con->sock->sk->sk_error_report = lowcomms_error_report; } /* Add the port number to an IPv6 or 4 sockaddr and return the address @@ -514,17 +537,24 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, } /* Close a remote connection and tidy up */ -static void close_connection(struct connection *con, bool and_other) +static void close_connection(struct connection *con, bool and_other, + bool tx, bool rx) { - mutex_lock(&con->sock_mutex); + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_WRITE_PENDING, &con->flags); + if (tx && cancel_work_sync(&con->swork)) + log_print("canceled swork for node %d", con->nodeid); + if (rx && cancel_work_sync(&con->rwork)) + log_print("canceled rwork for node %d", con->nodeid); + mutex_lock(&con->sock_mutex); if (con->sock) { sock_release(con->sock); con->sock = NULL; } if (con->othercon && and_other) { /* Will only re-enter once. */ - close_connection(con->othercon, false); + close_connection(con->othercon, false, true, true); } if (con->rx_page) { __free_page(con->rx_page); @@ -535,254 +565,6 @@ static void close_connection(struct connection *con, bool and_other) mutex_unlock(&con->sock_mutex); } -/* We only send shutdown messages to nodes that are not part of the cluster */ -static void sctp_send_shutdown(sctp_assoc_t associd) -{ - static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; - struct msghdr outmessage; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - int ret; - struct connection *con; - - con = nodeid2con(0,0); - BUG_ON(con == NULL); - - outmessage.msg_name = NULL; - outmessage.msg_namelen = 0; - outmessage.msg_control = outcmsg; - outmessage.msg_controllen = sizeof(outcmsg); - outmessage.msg_flags = MSG_EOR; - - cmsg = CMSG_FIRSTHDR(&outmessage); - cmsg->cmsg_level = IPPROTO_SCTP; - cmsg->cmsg_type = SCTP_SNDRCV; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - outmessage.msg_controllen = cmsg->cmsg_len; - sinfo = CMSG_DATA(cmsg); - memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - - sinfo->sinfo_flags |= MSG_EOF; - sinfo->sinfo_assoc_id = associd; - - ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0); - - if (ret != 0) - log_print("send EOF to node failed: %d", ret); -} - -static void sctp_init_failed_foreach(struct connection *con) -{ - - /* - * Don't try to recover base con and handle race where the - * other node's assoc init creates a assoc and we get that - * notification, then we get a notification that our attempt - * failed due. This happens when we are still trying the primary - * address, but the other node has already tried secondary addrs - * and found one that worked. - */ - if (!con->nodeid || con->sctp_assoc) - return; - - log_print("Retrying SCTP association init for node %d\n", con->nodeid); - - con->try_new_addr = true; - con->sctp_assoc = 0; - if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) { - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); - } -} - -/* INIT failed but we don't know which node... - restart INIT on all pending nodes */ -static void sctp_init_failed(void) -{ - mutex_lock(&connections_lock); - - foreach_conn(sctp_init_failed_foreach); - - mutex_unlock(&connections_lock); -} - -static void retry_failed_sctp_send(struct connection *recv_con, - struct sctp_send_failed *sn_send_failed, - char *buf) -{ - int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed); - struct dlm_mhandle *mh; - struct connection *con; - char *retry_buf; - int nodeid = sn_send_failed->ssf_info.sinfo_ppid; - - log_print("Retry sending %d bytes to node id %d", len, nodeid); - - if (!nodeid) { - log_print("Shouldn't resend data via listening connection."); - return; - } - - con = nodeid2con(nodeid, 0); - if (!con) { - log_print("Could not look up con for nodeid %d\n", - nodeid); - return; - } - - mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf); - if (!mh) { - log_print("Could not allocate buf for retry."); - return; - } - memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len); - dlm_lowcomms_commit_buffer(mh); - - /* - * If we got a assoc changed event before the send failed event then - * we only need to retry the send. - */ - if (con->sctp_assoc) { - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); - } else - sctp_init_failed_foreach(con); -} - -/* Something happened to an association */ -static void process_sctp_notification(struct connection *con, - struct msghdr *msg, char *buf) -{ - union sctp_notification *sn = (union sctp_notification *)buf; - struct linger linger; - - switch (sn->sn_header.sn_type) { - case SCTP_SEND_FAILED: - retry_failed_sctp_send(con, &sn->sn_send_failed, buf); - break; - case SCTP_ASSOC_CHANGE: - switch (sn->sn_assoc_change.sac_state) { - case SCTP_COMM_UP: - case SCTP_RESTART: - { - /* Check that the new node is in the lockspace */ - struct sctp_prim prim; - int nodeid; - int prim_len, ret; - int addr_len; - struct connection *new_con; - - /* - * We get this before any data for an association. - * We verify that the node is in the cluster and - * then peel off a socket for it. - */ - if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { - log_print("COMM_UP for invalid assoc ID %d", - (int)sn->sn_assoc_change.sac_assoc_id); - sctp_init_failed(); - return; - } - memset(&prim, 0, sizeof(struct sctp_prim)); - prim_len = sizeof(struct sctp_prim); - prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id; - - ret = kernel_getsockopt(con->sock, - IPPROTO_SCTP, - SCTP_PRIMARY_ADDR, - (char*)&prim, - &prim_len); - if (ret < 0) { - log_print("getsockopt/sctp_primary_addr on " - "new assoc %d failed : %d", - (int)sn->sn_assoc_change.sac_assoc_id, - ret); - - /* Retry INIT later */ - new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id); - if (new_con) - clear_bit(CF_CONNECT_PENDING, &con->flags); - return; - } - make_sockaddr(&prim.ssp_addr, 0, &addr_len); - if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { - unsigned char *b=(unsigned char *)&prim.ssp_addr; - log_print("reject connect from unknown addr"); - print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, - b, sizeof(struct sockaddr_storage)); - sctp_send_shutdown(prim.ssp_assoc_id); - return; - } - - new_con = nodeid2con(nodeid, GFP_NOFS); - if (!new_con) - return; - - /* Peel off a new sock */ - lock_sock(con->sock->sk); - ret = sctp_do_peeloff(con->sock->sk, - sn->sn_assoc_change.sac_assoc_id, - &new_con->sock); - release_sock(con->sock->sk); - if (ret < 0) { - log_print("Can't peel off a socket for " - "connection %d to node %d: err=%d", - (int)sn->sn_assoc_change.sac_assoc_id, - nodeid, ret); - return; - } - add_sock(new_con->sock, new_con); - - linger.l_onoff = 1; - linger.l_linger = 0; - ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER, - (char *)&linger, sizeof(linger)); - if (ret < 0) - log_print("set socket option SO_LINGER failed"); - - log_print("connecting to %d sctp association %d", - nodeid, (int)sn->sn_assoc_change.sac_assoc_id); - - new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id; - new_con->try_new_addr = false; - /* Send any pending writes */ - clear_bit(CF_CONNECT_PENDING, &new_con->flags); - clear_bit(CF_INIT_PENDING, &new_con->flags); - if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { - queue_work(send_workqueue, &new_con->swork); - } - if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags)) - queue_work(recv_workqueue, &new_con->rwork); - } - break; - - case SCTP_COMM_LOST: - case SCTP_SHUTDOWN_COMP: - { - con = assoc2con(sn->sn_assoc_change.sac_assoc_id); - if (con) { - con->sctp_assoc = 0; - } - } - break; - - case SCTP_CANT_STR_ASSOC: - { - /* Will retry init when we get the send failed notification */ - log_print("Can't start SCTP association - retrying"); - } - break; - - default: - log_print("unexpected SCTP assoc change id=%d state=%d", - (int)sn->sn_assoc_change.sac_assoc_id, - sn->sn_assoc_change.sac_state); - } - default: - ; /* fall through */ - } -} - /* Data received from remote end */ static int receive_from_sock(struct connection *con) { @@ -793,7 +575,6 @@ static int receive_from_sock(struct connection *con) int r; int call_again_soon = 0; int nvec; - char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; mutex_lock(&con->sock_mutex); @@ -801,6 +582,10 @@ static int receive_from_sock(struct connection *con) ret = -EAGAIN; goto out_close; } + if (con->nodeid == 0) { + ret = -EINVAL; + goto out_close; + } if (con->rx_page == NULL) { /* @@ -813,11 +598,6 @@ static int receive_from_sock(struct connection *con) cbuf_init(&con->cb, PAGE_CACHE_SIZE); } - /* Only SCTP needs these really */ - memset(&incmsg, 0, sizeof(incmsg)); - msg.msg_control = incmsg; - msg.msg_controllen = sizeof(incmsg); - /* * iov[0] is the bit of the circular buffer between the current end * point (cb.base + cb.len) and the end of the buffer. @@ -843,31 +623,18 @@ static int receive_from_sock(struct connection *con) MSG_DONTWAIT | MSG_NOSIGNAL); if (ret <= 0) goto out_close; - - /* Process SCTP notifications */ - if (msg.msg_flags & MSG_NOTIFICATION) { - msg.msg_control = incmsg; - msg.msg_controllen = sizeof(incmsg); - - process_sctp_notification(con, &msg, - page_address(con->rx_page) + con->cb.base); - mutex_unlock(&con->sock_mutex); - return 0; - } - BUG_ON(con->nodeid == 0); - - if (ret == len) + else if (ret == len) call_again_soon = 1; + cbuf_add(&con->cb, ret); ret = dlm_process_incoming_buffer(con->nodeid, page_address(con->rx_page), con->cb.base, con->cb.len, PAGE_CACHE_SIZE); if (ret == -EBADMSG) { - log_print("lowcomms: addr=%p, base=%u, len=%u, " - "iov_len=%u, iov_base[0]=%p, read=%d", - page_address(con->rx_page), con->cb.base, con->cb.len, - len, iov[0].iov_base, r); + log_print("lowcomms: addr=%p, base=%u, len=%u, read=%d", + page_address(con->rx_page), con->cb.base, + con->cb.len, r); } if (ret < 0) goto out_close; @@ -892,7 +659,7 @@ out_resched: out_close: mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN) { - close_connection(con, false); + close_connection(con, false, true, false); /* Reconnect when there is something to send */ } /* Don't return success if we really got EOF */ @@ -1033,6 +800,120 @@ accept_err: return result; } +static int sctp_accept_from_sock(struct connection *con) +{ + /* Check that the new node is in the lockspace */ + struct sctp_prim prim; + int nodeid; + int prim_len, ret; + int addr_len; + struct connection *newcon; + struct connection *addcon; + struct socket *newsock; + + mutex_lock(&connections_lock); + if (!dlm_allow_conn) { + mutex_unlock(&connections_lock); + return -1; + } + mutex_unlock(&connections_lock); + + mutex_lock_nested(&con->sock_mutex, 0); + + ret = kernel_accept(con->sock, &newsock, O_NONBLOCK); + if (ret < 0) + goto accept_err; + + memset(&prim, 0, sizeof(struct sctp_prim)); + prim_len = sizeof(struct sctp_prim); + + ret = kernel_getsockopt(newsock, IPPROTO_SCTP, SCTP_PRIMARY_ADDR, + (char *)&prim, &prim_len); + if (ret < 0) { + log_print("getsockopt/sctp_primary_addr failed: %d", ret); + goto accept_err; + } + + make_sockaddr(&prim.ssp_addr, 0, &addr_len); + if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + unsigned char *b = (unsigned char *)&prim.ssp_addr; + + log_print("reject connect from unknown addr"); + print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, + b, sizeof(struct sockaddr_storage)); + goto accept_err; + } + + newcon = nodeid2con(nodeid, GFP_NOFS); + if (!newcon) { + ret = -ENOMEM; + goto accept_err; + } + + mutex_lock_nested(&newcon->sock_mutex, 1); + + if (newcon->sock) { + struct connection *othercon = newcon->othercon; + + if (!othercon) { + othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + if (!othercon) { + log_print("failed to allocate incoming socket"); + mutex_unlock(&newcon->sock_mutex); + ret = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; + othercon->rx_action = receive_from_sock; + mutex_init(&othercon->sock_mutex); + INIT_WORK(&othercon->swork, process_send_sockets); + INIT_WORK(&othercon->rwork, process_recv_sockets); + set_bit(CF_IS_OTHERCON, &othercon->flags); + } + if (!othercon->sock) { + newcon->othercon = othercon; + othercon->sock = newsock; + newsock->sk->sk_user_data = othercon; + add_sock(newsock, othercon); + addcon = othercon; + } else { + printk("Extra connection from node %d attempted\n", nodeid); + ret = -EAGAIN; + mutex_unlock(&newcon->sock_mutex); + goto accept_err; + } + } else { + newsock->sk->sk_user_data = newcon; + newcon->rx_action = receive_from_sock; + add_sock(newsock, newcon); + addcon = newcon; + } + + log_print("connected to %d", nodeid); + + mutex_unlock(&newcon->sock_mutex); + + /* + * Add it to the active queue in case we got data + * between processing the accept adding the socket + * to the read_sockets list + */ + if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) + queue_work(recv_workqueue, &addcon->rwork); + mutex_unlock(&con->sock_mutex); + + return 0; + +accept_err: + mutex_unlock(&con->sock_mutex); + if (newsock) + sock_release(newsock); + if (ret != -EAGAIN) + log_print("error accepting connection from node: %d", ret); + + return ret; +} + static void free_entry(struct writequeue_entry *e) { __free_page(e->page); @@ -1057,97 +938,129 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) } } +/* + * sctp_bind_addrs - bind a SCTP socket to all our addresses + */ +static int sctp_bind_addrs(struct connection *con, uint16_t port) +{ + struct sockaddr_storage localaddr; + int i, addr_len, result = 0; + + for (i = 0; i < dlm_local_count; i++) { + memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + make_sockaddr(&localaddr, port, &addr_len); + + if (!i) + result = kernel_bind(con->sock, + (struct sockaddr *)&localaddr, + addr_len); + else + result = kernel_setsockopt(con->sock, SOL_SCTP, + SCTP_SOCKOPT_BINDX_ADD, + (char *)&localaddr, addr_len); + + if (result < 0) { + log_print("Can't bind to %d addr number %d, %d.\n", + port, i + 1, result); + break; + } + } + return result; +} + /* Initiate an SCTP association. This is a special case of send_to_sock() in that we don't yet have a peeled-off socket for this association, so we use the listening socket and add the primary IP address of the remote node. */ -static void sctp_init_assoc(struct connection *con) +static void sctp_connect_to_sock(struct connection *con) { - struct sockaddr_storage rem_addr; - char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; - struct msghdr outmessage; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - struct connection *base_con; - struct writequeue_entry *e; - int len, offset; - int ret; - int addrlen; - struct kvec iov[1]; + struct sockaddr_storage daddr; + int one = 1; + int result; + int addr_len; + struct socket *sock; + + if (con->nodeid == 0) { + log_print("attempt to connect sock 0 foiled"); + return; + } mutex_lock(&con->sock_mutex); - if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) - goto unlock; - if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr, - con->try_new_addr)) { + /* Some odd races can cause double-connects, ignore them */ + if (con->retries++ > MAX_CONNECT_RETRIES) + goto out; + + if (con->sock) { + log_print("node %d already connected.", con->nodeid); + goto out; + } + + memset(&daddr, 0, sizeof(daddr)); + result = nodeid_to_addr(con->nodeid, &daddr, NULL, true); + if (result < 0) { log_print("no address for nodeid %d", con->nodeid); - goto unlock; + goto out; } - base_con = nodeid2con(0, 0); - BUG_ON(base_con == NULL); - make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen); + /* Create a socket to communicate with */ + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_SCTP, &sock); + if (result < 0) + goto socket_err; - outmessage.msg_name = &rem_addr; - outmessage.msg_namelen = addrlen; - outmessage.msg_control = outcmsg; - outmessage.msg_controllen = sizeof(outcmsg); - outmessage.msg_flags = MSG_EOR; + sock->sk->sk_user_data = con; + con->rx_action = receive_from_sock; + con->connect_action = sctp_connect_to_sock; + add_sock(sock, con); - spin_lock(&con->writequeue_lock); + /* Bind to all addresses. */ + if (sctp_bind_addrs(con, 0)) + goto bind_err; - if (list_empty(&con->writequeue)) { - spin_unlock(&con->writequeue_lock); - log_print("writequeue empty for nodeid %d", con->nodeid); - goto unlock; - } + make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len); - e = list_first_entry(&con->writequeue, struct writequeue_entry, list); - len = e->len; - offset = e->offset; + log_print("connecting to %d", con->nodeid); - /* Send the first block off the write queue */ - iov[0].iov_base = page_address(e->page)+offset; - iov[0].iov_len = len; - spin_unlock(&con->writequeue_lock); + /* Turn off Nagle's algorithm */ + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + sizeof(one)); - if (rem_addr.ss_family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr; - log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr); - } else { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr; - log_print("Trying to connect to %pI6", &sin6->sin6_addr); - } + result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, + O_NONBLOCK); + if (result == -EINPROGRESS) + result = 0; + if (result == 0) + goto out; - cmsg = CMSG_FIRSTHDR(&outmessage); - cmsg->cmsg_level = IPPROTO_SCTP; - cmsg->cmsg_type = SCTP_SNDRCV; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - sinfo = CMSG_DATA(cmsg); - memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - sinfo->sinfo_ppid = cpu_to_le32(con->nodeid); - outmessage.msg_controllen = cmsg->cmsg_len; - sinfo->sinfo_flags |= SCTP_ADDR_OVER; - ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); - if (ret < 0) { - log_print("Send first packet to node %d failed: %d", - con->nodeid, ret); +bind_err: + con->sock = NULL; + sock_release(sock); - /* Try again later */ +socket_err: + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. + */ + if (result != -EHOSTUNREACH && + result != -ENETUNREACH && + result != -ENETDOWN && + result != -EINVAL && + result != -EPROTONOSUPPORT) { + log_print("connect %d try %d error %d", con->nodeid, + con->retries, result); + mutex_unlock(&con->sock_mutex); + msleep(1000); clear_bit(CF_CONNECT_PENDING, &con->flags); - clear_bit(CF_INIT_PENDING, &con->flags); - } - else { - spin_lock(&con->writequeue_lock); - writequeue_entry_complete(e, ret); - spin_unlock(&con->writequeue_lock); + lowcomms_connect_sock(con); + return; } -unlock: +out: mutex_unlock(&con->sock_mutex); + set_bit(CF_WRITE_PENDING, &con->flags); } /* Connect a new socket to its peer */ @@ -1236,11 +1149,13 @@ out_err: con->retries, result); mutex_unlock(&con->sock_mutex); msleep(1000); + clear_bit(CF_CONNECT_PENDING, &con->flags); lowcomms_connect_sock(con); return; } out: mutex_unlock(&con->sock_mutex); + set_bit(CF_WRITE_PENDING, &con->flags); return; } @@ -1325,37 +1240,11 @@ static void init_local(void) } } -/* Bind to an IP address. SCTP allows multiple address so it can do - multi-homing */ -static int add_sctp_bind_addr(struct connection *sctp_con, - struct sockaddr_storage *addr, - int addr_len, int num) -{ - int result = 0; - - if (num == 1) - result = kernel_bind(sctp_con->sock, - (struct sockaddr *) addr, - addr_len); - else - result = kernel_setsockopt(sctp_con->sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, - (char *)addr, addr_len); - - if (result < 0) - log_print("Can't bind to port %d addr number %d", - dlm_config.ci_tcp_port, num); - - return result; -} - /* Initialise SCTP socket and bind to all interfaces */ static int sctp_listen_for_all(void) { struct socket *sock = NULL; - struct sockaddr_storage localaddr; - struct sctp_event_subscribe subscribe; - int result = -EINVAL, num = 1, i, addr_len; + int result = -EINVAL; struct connection *con = nodeid2con(0, GFP_NOFS); int bufsize = NEEDED_RMEM; int one = 1; @@ -1366,33 +1255,17 @@ static int sctp_listen_for_all(void) log_print("Using SCTP for communications"); result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_SEQPACKET, IPPROTO_SCTP, &sock); + SOCK_STREAM, IPPROTO_SCTP, &sock); if (result < 0) { log_print("Can't create comms socket, check SCTP is loaded"); goto out; } - /* Listen for events */ - memset(&subscribe, 0, sizeof(subscribe)); - subscribe.sctp_data_io_event = 1; - subscribe.sctp_association_event = 1; - subscribe.sctp_send_failure_event = 1; - subscribe.sctp_shutdown_event = 1; - subscribe.sctp_partial_delivery_event = 1; - result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, (char *)&bufsize, sizeof(bufsize)); if (result) log_print("Error increasing buffer space on socket %d", result); - result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS, - (char *)&subscribe, sizeof(subscribe)); - if (result < 0) { - log_print("Failed to set SCTP_EVENTS on socket: result=%d", - result); - goto create_delsock; - } - result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); if (result < 0) @@ -1402,19 +1275,12 @@ static int sctp_listen_for_all(void) sock->sk->sk_user_data = con; con->sock = sock; con->sock->sk->sk_data_ready = lowcomms_data_ready; - con->rx_action = receive_from_sock; - con->connect_action = sctp_init_assoc; + con->rx_action = sctp_accept_from_sock; + con->connect_action = sctp_connect_to_sock; - /* Bind to all interfaces. */ - for (i = 0; i < dlm_local_count; i++) { - memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); - make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len); - - result = add_sctp_bind_addr(con, &localaddr, addr_len, num); - if (result) - goto create_delsock; - ++num; - } + /* Bind to all addresses. */ + if (sctp_bind_addrs(con, dlm_config.ci_tcp_port)) + goto create_delsock; result = sock->ops->listen(sock, 5); if (result < 0) { @@ -1612,14 +1478,13 @@ out: send_error: mutex_unlock(&con->sock_mutex); - close_connection(con, false); + close_connection(con, false, false, true); lowcomms_connect_sock(con); return; out_connect: mutex_unlock(&con->sock_mutex); - if (!test_bit(CF_INIT_PENDING, &con->flags)) - lowcomms_connect_sock(con); + lowcomms_connect_sock(con); } static void clean_one_writequeue(struct connection *con) @@ -1644,15 +1509,9 @@ int dlm_lowcomms_close(int nodeid) log_print("closing connection to node %d", nodeid); con = nodeid2con(nodeid, 0); if (con) { - clear_bit(CF_CONNECT_PENDING, &con->flags); - clear_bit(CF_WRITE_PENDING, &con->flags); set_bit(CF_CLOSE, &con->flags); - if (cancel_work_sync(&con->swork)) - log_print("canceled swork for node %d", nodeid); - if (cancel_work_sync(&con->rwork)) - log_print("canceled rwork for node %d", nodeid); + close_connection(con, true, true, true); clean_one_writequeue(con); - close_connection(con, true); } spin_lock(&dlm_node_addrs_spin); @@ -1685,10 +1544,8 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); - if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) con->connect_action(con); - set_bit(CF_WRITE_PENDING, &con->flags); - } if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) send_to_sock(con); } @@ -1735,7 +1592,7 @@ static void stop_conn(struct connection *con) static void free_conn(struct connection *con) { - close_connection(con, true); + close_connection(con, true, true, true); if (con->othercon) kmem_cache_free(con_cache, con->othercon); hlist_del(&con->list); @@ -1806,7 +1663,7 @@ fail_unlisten: dlm_allow_conn = 0; con = nodeid2con(0,0); if (con) { - close_connection(con, false); + close_connection(con, false, true, true); kmem_cache_free(con_cache, con); } fail_destroy: diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index e0ab3a93eeff..5532f097f6da 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -509,7 +509,6 @@ int dlm_plock_init(void) void dlm_plock_exit(void) { - if (misc_deregister(&plock_dev_misc) < 0) - log_print("dlm_plock_exit: misc_deregister failed"); + misc_deregister(&plock_dev_misc); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index fb85f32e9eca..173b3873a4f4 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -362,18 +362,15 @@ fail: int dlm_device_deregister(struct dlm_ls *ls) { - int error; - /* The device is not registered. This happens when the lockspace was never used from userspace, or when device_create_lockspace() calls dlm_release_lockspace() after the register fails. */ if (!ls->ls_device.name) return 0; - error = misc_deregister(&ls->ls_device); - if (!error) - kfree(ls->ls_device.name); - return error; + misc_deregister(&ls->ls_device); + kfree(ls->ls_device.name); + return 0; } static int device_user_purge(struct dlm_user_proc *proc, @@ -785,6 +782,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, DECLARE_WAITQUEUE(wait, current); struct dlm_callback cb; int rv, resid, copy_lvb = 0; + int old_mode, new_mode; if (count == sizeof(struct dlm_device_version)) { rv = copy_version_to_user(buf, count); @@ -841,6 +839,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); + /* rem_lkb_callback sets a new lkb_last_cast */ + old_mode = lkb->lkb_last_cast.mode; + rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); if (rv < 0) { /* this shouldn't happen; lkb should have been removed from @@ -864,9 +865,6 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } if (cb.flags & DLM_CB_CAST) { - int old_mode, new_mode; - - old_mode = lkb->lkb_last_cast.mode; new_mode = cb.mode; if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 5718cb9f7273..d72d52b90433 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -27,13 +27,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&inode_sb_list_lock); + + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 97315f2f6816..80d6901493cf 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -258,8 +258,7 @@ void ecryptfs_destroy_mount_crypt_stat( &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { list_del(&auth_tok->mount_crypt_stat_list); - if (auth_tok->global_auth_tok_key - && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) + if (!(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) key_put(auth_tok->global_auth_tok_key); kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok); } diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 8db0b464483f..63cd2c147221 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -45,20 +45,20 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - int rc; - - if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) - return 1; + int rc = 1; if (flags & LOOKUP_RCU) return -ECHILD; - rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) + rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (d_really_is_positive(dentry)) { - struct inode *lower_inode = - ecryptfs_inode_to_lower(d_inode(dentry)); + struct inode *inode = d_inode(dentry); - fsstack_copy_attr_all(d_inode(dentry), lower_inode); + fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode)); + if (!inode->i_nlink) + return 0; } return rc; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 72afcc629d7b..feef8a9c4de7 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -325,7 +325,6 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return rc; switch (cmd) { - case FITRIM: case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: case FS_IOC32_GETVERSION: diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index cf208522998e..caba848ac763 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -299,7 +299,7 @@ static int ecryptfs_write_begin(struct file *file, rc = ecryptfs_read_lower_page_segment( page, index, 0, PAGE_CACHE_SIZE, mapping->host); if (rc) { - printk(KERN_ERR "%s: Error attemping to read " + printk(KERN_ERR "%s: Error attempting to read " "lower page segment; rc = [%d]\n", __func__, rc); ClearPageUptodate(page); diff --git a/fs/exec.c b/fs/exec.c index 1977c2a553ac..b06623a9347f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -98,6 +98,12 @@ static inline void put_binfmt(struct linux_binfmt * fmt) module_put(fmt->module); } +bool path_noexec(const struct path *path) +{ + return (path->mnt->mnt_flags & MNT_NOEXEC) || + (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); +} + #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to @@ -132,7 +138,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) goto exit; error = -EACCES; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&file->f_path)) goto exit; fsnotify_open(file); @@ -777,7 +783,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (!S_ISREG(file_inode(file)->i_mode)) goto exit; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&file->f_path)) goto exit; err = deny_write_access(file); diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 4deb0b05b011..e5bb2abf77f9 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page) page_cache_release(page); } -/* Accesses dir's inode->i_size must be called under inode lock */ -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) { loff_t last_byte = inode->i_size; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 796b491e6978..0c6638b40f21 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 3a0a6c6406d0..1982c3f11aec 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" @@ -28,16 +29,23 @@ #ifdef CONFIG_FS_DAX static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext2_get_block); + return dax_fault(vma, vmf, ext2_get_block, NULL); +} + +static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); } static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext2_get_block); + return dax_mkwrite(vma, vmf, ext2_get_block, NULL); } static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, + .pmd_fault = ext2_dax_pmd_fault, .page_mkwrite = ext2_dax_mkwrite, .pfn_mkwrite = dax_pfn_mkwrite, }; @@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; } #else diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 5c04a0ddea80..efe5fb21c533 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -577,7 +577,10 @@ got: goto fail; } - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + goto fail_drop; + err = dquot_alloc_inode(inode); if (err) goto fail_drop; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 5c09776d347f..c60a248c640c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -25,6 +25,7 @@ #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/writeback.h> #include <linux/buffer_head.h> @@ -1552,8 +1553,11 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; - if (is_quota_modification(inode, iattr)) - dquot_initialize(inode); + if (is_quota_modification(inode, iattr)) { + error = dquot_initialize(inode); + if (error) + return error; + } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { error = dquot_transfer(inode, iattr); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 13ec54a99c96..b4841e3066a5 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -96,8 +96,11 @@ struct dentry *ext2_get_parent(struct dentry *child) static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; + int err; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; inode = ext2_new_inode(dir, mode, &dentry->d_name); if (IS_ERR(inode)) @@ -143,7 +146,9 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, if (!new_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; inode = ext2_new_inode (dir, mode, &dentry->d_name); err = PTR_ERR(inode); @@ -169,7 +174,9 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + goto out; inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name); err = PTR_ERR(inode); @@ -212,7 +219,9 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int err; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -233,7 +242,9 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; inode_inc_link_count(dir); @@ -279,13 +290,17 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); struct ext2_dir_entry_2 * de; struct page * page; - int err = -ENOENT; + int err; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + goto out; de = ext2_find_entry (dir, &dentry->d_name, &page); - if (!de) + if (!de) { + err = -ENOENT; goto out; + } err = ext2_delete_entry (de, page); if (err) @@ -323,14 +338,21 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * dir_de = NULL; struct page * old_page; struct ext2_dir_entry_2 * old_de; - int err = -ENOENT; + int err; + + err = dquot_initialize(old_dir); + if (err) + goto out; - dquot_initialize(old_dir); - dquot_initialize(new_dir); + err = dquot_initialize(new_dir); + if (err) + goto out; old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); - if (!old_de) + if (!old_de) { + err = -ENOENT; goto out; + } if (S_ISDIR(old_inode->i_mode)) { err = -EIO; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig deleted file mode 100644 index e8c6ba0e4a3e..000000000000 --- a/fs/ext3/Kconfig +++ /dev/null @@ -1,89 +0,0 @@ -config EXT3_FS - tristate "Ext3 journalling file system support" - select JBD - help - This is the journalling version of the Second extended file system - (often called ext3), the de facto standard Linux file system - (method to organize files on a storage device) for hard disks. - - The journalling code included in this driver means you do not have - to run e2fsck (file system checker) on your file systems after a - crash. The journal keeps track of any changes that were being made - at the time the system crashed, and can ensure that your file system - is consistent without the need for a lengthy check. - - Other than adding the journal to the file system, the on-disk format - of ext3 is identical to ext2. It is possible to freely switch - between using the ext3 driver and the ext2 driver, as long as the - file system has been cleanly unmounted, or e2fsck is run on the file - system. - - To add a journal on an existing ext2 file system or change the - behavior of ext3 file systems, you can use the tune2fs utility ("man - tune2fs"). To modify attributes of files and directories on ext3 - file systems, use chattr ("man chattr"). You need to be using - e2fsprogs version 1.20 or later in order to create ext3 journals - (available at <http://sourceforge.net/projects/e2fsprogs/>). - - To compile this file system support as a module, choose M here: the - module will be called ext3. - -config EXT3_DEFAULTS_TO_ORDERED - bool "Default to 'data=ordered' in ext3" - depends on EXT3_FS - default y - help - The journal mode options for ext3 have different tradeoffs - between when data is guaranteed to be on disk and - performance. The use of "data=writeback" can cause - unwritten data to appear in files after an system crash or - power failure, which can be a security issue. However, - "data=ordered" mode can also result in major performance - problems, including seconds-long delays before an fsync() - call returns. For details, see: - - http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs - - If you have been historically happy with ext3's performance, - data=ordered mode will be a safe choice and you should - answer 'y' here. If you understand the reliability and data - privacy issues of data=writeback and are willing to make - that trade off, answer 'n'. - -config EXT3_FS_XATTR - bool "Ext3 extended attributes" - depends on EXT3_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext3. - -config EXT3_FS_POSIX_ACL - bool "Ext3 POSIX Access Control Lists" - depends on EXT3_FS_XATTR - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT3_FS_SECURITY - bool "Ext3 Security Labels" - depends on EXT3_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext3 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile deleted file mode 100644 index e77766a8b3f0..000000000000 --- a/fs/ext3/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# -# Makefile for the linux ext3-filesystem routines. -# - -obj-$(CONFIG_EXT3_FS) += ext3.o - -ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o - -ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c deleted file mode 100644 index 8bbaf5bcf982..000000000000 --- a/fs/ext3/acl.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * linux/fs/ext3/acl.c - * - * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - */ - -#include "ext3.h" -#include "xattr.h" -#include "acl.h" - -/* - * Convert from filesystem to in-memory representation. - */ -static struct posix_acl * -ext3_acl_from_disk(const void *value, size_t size) -{ - const char *end = (char *)value + size; - int n, count; - struct posix_acl *acl; - - if (!value) - return NULL; - if (size < sizeof(ext3_acl_header)) - return ERR_PTR(-EINVAL); - if (((ext3_acl_header *)value)->a_version != - cpu_to_le32(EXT3_ACL_VERSION)) - return ERR_PTR(-EINVAL); - value = (char *)value + sizeof(ext3_acl_header); - count = ext3_acl_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - for (n=0; n < count; n++) { - ext3_acl_entry *entry = - (ext3_acl_entry *)value; - if ((char *)value + sizeof(ext3_acl_entry_short) > end) - goto fail; - acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); - acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - value = (char *)value + - sizeof(ext3_acl_entry_short); - break; - - case ACL_USER: - value = (char *)value + sizeof(ext3_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_uid = - make_kuid(&init_user_ns, - le32_to_cpu(entry->e_id)); - break; - case ACL_GROUP: - value = (char *)value + sizeof(ext3_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_gid = - make_kgid(&init_user_ns, - le32_to_cpu(entry->e_id)); - break; - - default: - goto fail; - } - } - if (value != end) - goto fail; - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} - -/* - * Convert from in-memory to filesystem representation. - */ -static void * -ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) -{ - ext3_acl_header *ext_acl; - char *e; - size_t n; - - *size = ext3_acl_size(acl->a_count); - ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count * - sizeof(ext3_acl_entry), GFP_NOFS); - if (!ext_acl) - return ERR_PTR(-ENOMEM); - ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); - e = (char *)ext_acl + sizeof(ext3_acl_header); - for (n=0; n < acl->a_count; n++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - ext3_acl_entry *entry = (ext3_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl_e->e_tag); - entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch(acl_e->e_tag) { - case ACL_USER: - entry->e_id = cpu_to_le32( - from_kuid(&init_user_ns, acl_e->e_uid)); - e += sizeof(ext3_acl_entry); - break; - case ACL_GROUP: - entry->e_id = cpu_to_le32( - from_kgid(&init_user_ns, acl_e->e_gid)); - e += sizeof(ext3_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(ext3_acl_entry_short); - break; - - default: - goto fail; - } - } - return (char *)ext_acl; - -fail: - kfree(ext_acl); - return ERR_PTR(-EINVAL); -} - -/* - * Inode operation get_posix_acl(). - * - * inode->i_mutex: don't care - */ -struct posix_acl * -ext3_get_acl(struct inode *inode, int type) -{ - int name_index; - char *value = NULL; - struct posix_acl *acl; - int retval; - - switch (type) { - case ACL_TYPE_ACCESS: - name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - default: - BUG(); - } - - retval = ext3_xattr_get(inode, name_index, "", NULL, 0); - if (retval > 0) { - value = kmalloc(retval, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - retval = ext3_xattr_get(inode, name_index, "", value, retval); - } - if (retval > 0) - acl = ext3_acl_from_disk(value, retval); - else if (retval == -ENODATA || retval == -ENOSYS) - acl = NULL; - else - acl = ERR_PTR(retval); - kfree(value); - - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - - return acl; -} - -/* - * Set the access or default ACL of an inode. - * - * inode->i_mutex: down unless called from ext3_new_inode - */ -static int -__ext3_set_acl(handle_t *handle, struct inode *inode, int type, - struct posix_acl *acl) -{ - int name_index; - void *value = NULL; - size_t size = 0; - int error; - - switch(type) { - case ACL_TYPE_ACCESS: - name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - return error; - else { - inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } - } - break; - - case ACL_TYPE_DEFAULT: - name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; - - default: - return -EINVAL; - } - if (acl) { - value = ext3_acl_to_disk(acl, &size); - if (IS_ERR(value)) - return (int)PTR_ERR(value); - } - - error = ext3_xattr_set_handle(handle, inode, name_index, "", - value, size, 0); - - kfree(value); - - if (!error) - set_cached_acl(inode, type, acl); - - return error; -} - -int -ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) -{ - handle_t *handle; - int error, retries = 0; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = __ext3_set_acl(handle, inode, type, acl); - ext3_journal_stop(handle); - if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return error; -} - -/* - * Initialize the ACLs of a new inode. Called from ext3_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ -int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - int error; - - error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (error) - return error; - - if (default_acl) { - error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, - default_acl); - posix_acl_release(default_acl); - } - if (acl) { - if (!error) - error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, - acl); - posix_acl_release(acl); - } - return error; -} diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h deleted file mode 100644 index ea1c69edab9e..000000000000 --- a/fs/ext3/acl.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - File: fs/ext3/acl.h - - (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> -*/ - -#include <linux/posix_acl_xattr.h> - -#define EXT3_ACL_VERSION 0x0001 - -typedef struct { - __le16 e_tag; - __le16 e_perm; - __le32 e_id; -} ext3_acl_entry; - -typedef struct { - __le16 e_tag; - __le16 e_perm; -} ext3_acl_entry_short; - -typedef struct { - __le32 a_version; -} ext3_acl_header; - -static inline size_t ext3_acl_size(int count) -{ - if (count <= 4) { - return sizeof(ext3_acl_header) + - count * sizeof(ext3_acl_entry_short); - } else { - return sizeof(ext3_acl_header) + - 4 * sizeof(ext3_acl_entry_short) + - (count - 4) * sizeof(ext3_acl_entry); - } -} - -static inline int ext3_acl_count(size_t size) -{ - ssize_t s; - size -= sizeof(ext3_acl_header); - s = size - 4 * sizeof(ext3_acl_entry_short); - if (s < 0) { - if (size % sizeof(ext3_acl_entry_short)) - return -1; - return size / sizeof(ext3_acl_entry_short); - } else { - if (s % sizeof(ext3_acl_entry)) - return -1; - return s / sizeof(ext3_acl_entry) + 4; - } -} - -#ifdef CONFIG_EXT3_FS_POSIX_ACL - -/* acl.c */ -extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); -extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); - -#else /* CONFIG_EXT3_FS_POSIX_ACL */ -#include <linux/sched.h> -#define ext3_get_acl NULL -#define ext3_set_acl NULL - -static inline int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -{ - return 0; -} -#endif /* CONFIG_EXT3_FS_POSIX_ACL */ - diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c deleted file mode 100644 index 158b5d4ce067..000000000000 --- a/fs/ext3/balloc.c +++ /dev/null @@ -1,2158 +0,0 @@ -/* - * linux/fs/ext3/balloc.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/quotaops.h> -#include <linux/blkdev.h> -#include "ext3.h" - -/* - * balloc.c contains the blocks allocation and deallocation routines - */ - -/* - * The free blocks are managed by bitmaps. A file system contains several - * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap - * block for inodes, N blocks for the inode table and data blocks. - * - * The file system contains group descriptors which are located after the - * super block. Each descriptor contains the number of the bitmap block and - * the free blocks count in the block. The descriptors are loaded in memory - * when a file system is mounted (see ext3_fill_super). - */ - - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - -/* - * Calculate the block group number and offset, given a block number - */ -static void ext3_get_group_no_and_offset(struct super_block *sb, - ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp) -{ - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - - blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - if (offsetp) - *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb); - if (blockgrpp) - *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb); -} - -/** - * ext3_get_group_desc() -- load group descriptor from disk - * @sb: super block - * @block_group: given block group - * @bh: pointer to the buffer head to store the block - * group descriptor - */ -struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, - unsigned int block_group, - struct buffer_head ** bh) -{ - unsigned long group_desc; - unsigned long offset; - struct ext3_group_desc * desc; - struct ext3_sb_info *sbi = EXT3_SB(sb); - - if (block_group >= sbi->s_groups_count) { - ext3_error (sb, "ext3_get_group_desc", - "block_group >= groups_count - " - "block_group = %d, groups_count = %lu", - block_group, sbi->s_groups_count); - - return NULL; - } - smp_rmb(); - - group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); - offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); - if (!sbi->s_group_desc[group_desc]) { - ext3_error (sb, "ext3_get_group_desc", - "Group descriptor not loaded - " - "block_group = %d, group_desc = %lu, desc = %lu", - block_group, group_desc, offset); - return NULL; - } - - desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data; - if (bh) - *bh = sbi->s_group_desc[group_desc]; - return desc + offset; -} - -static int ext3_valid_block_bitmap(struct super_block *sb, - struct ext3_group_desc *desc, - unsigned int block_group, - struct buffer_head *bh) -{ - ext3_grpblk_t offset; - ext3_grpblk_t next_zero_bit; - ext3_fsblk_t bitmap_blk; - ext3_fsblk_t group_first_block; - - group_first_block = ext3_group_first_block_no(sb, block_group); - - /* check whether block bitmap block number is set */ - bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); - offset = bitmap_blk - group_first_block; - if (!ext3_test_bit(offset, bh->b_data)) - /* bad block bitmap */ - goto err_out; - - /* check whether the inode bitmap block number is set */ - bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); - offset = bitmap_blk - group_first_block; - if (!ext3_test_bit(offset, bh->b_data)) - /* bad block bitmap */ - goto err_out; - - /* check whether the inode table block number is set */ - bitmap_blk = le32_to_cpu(desc->bg_inode_table); - offset = bitmap_blk - group_first_block; - next_zero_bit = ext3_find_next_zero_bit(bh->b_data, - offset + EXT3_SB(sb)->s_itb_per_group, - offset); - if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group) - /* good bitmap for inode tables */ - return 1; - -err_out: - ext3_error(sb, __func__, - "Invalid block bitmap - " - "block_group = %d, block = %lu", - block_group, bitmap_blk); - return 0; -} - -/** - * read_block_bitmap() - * @sb: super block - * @block_group: given block group - * - * Read the bitmap for a given block_group,and validate the - * bits for block/inode/inode tables are set in the bitmaps - * - * Return buffer_head on success or NULL in case of failure. - */ -static struct buffer_head * -read_block_bitmap(struct super_block *sb, unsigned int block_group) -{ - struct ext3_group_desc * desc; - struct buffer_head * bh = NULL; - ext3_fsblk_t bitmap_blk; - - desc = ext3_get_group_desc(sb, block_group, NULL); - if (!desc) - return NULL; - trace_ext3_read_block_bitmap(sb, block_group); - bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); - bh = sb_getblk(sb, bitmap_blk); - if (unlikely(!bh)) { - ext3_error(sb, __func__, - "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %u", - block_group, le32_to_cpu(desc->bg_block_bitmap)); - return NULL; - } - if (likely(bh_uptodate_or_lock(bh))) - return bh; - - if (bh_submit_read(bh) < 0) { - brelse(bh); - ext3_error(sb, __func__, - "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %u", - block_group, le32_to_cpu(desc->bg_block_bitmap)); - return NULL; - } - ext3_valid_block_bitmap(sb, desc, block_group, bh); - /* - * file system mounted not to panic on error, continue with corrupt - * bitmap - */ - return bh; -} -/* - * The reservation window structure operations - * -------------------------------------------- - * Operations include: - * dump, find, add, remove, is_empty, find_next_reservable_window, etc. - * - * We use a red-black tree to represent per-filesystem reservation - * windows. - * - */ - -/** - * __rsv_window_dump() -- Dump the filesystem block allocation reservation map - * @rb_root: root of per-filesystem reservation rb tree - * @verbose: verbose mode - * @fn: function which wishes to dump the reservation map - * - * If verbose is turned on, it will print the whole block reservation - * windows(start, end). Otherwise, it will only print out the "bad" windows, - * those windows that overlap with their immediate neighbors. - */ -#if 1 -static void __rsv_window_dump(struct rb_root *root, int verbose, - const char *fn) -{ - struct rb_node *n; - struct ext3_reserve_window_node *rsv, *prev; - int bad; - -restart: - n = rb_first(root); - bad = 0; - prev = NULL; - - printk("Block Allocation Reservation Windows Map (%s):\n", fn); - while (n) { - rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); - if (verbose) - printk("reservation window 0x%p " - "start: %lu, end: %lu\n", - rsv, rsv->rsv_start, rsv->rsv_end); - if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { - printk("Bad reservation %p (start >= end)\n", - rsv); - bad = 1; - } - if (prev && prev->rsv_end >= rsv->rsv_start) { - printk("Bad reservation %p (prev->end >= start)\n", - rsv); - bad = 1; - } - if (bad) { - if (!verbose) { - printk("Restarting reservation walk in verbose mode\n"); - verbose = 1; - goto restart; - } - } - n = rb_next(n); - prev = rsv; - } - printk("Window map complete.\n"); - BUG_ON(bad); -} -#define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __func__) -#else -#define rsv_window_dump(root, verbose) do {} while (0) -#endif - -/** - * goal_in_my_reservation() - * @rsv: inode's reservation window - * @grp_goal: given goal block relative to the allocation block group - * @group: the current allocation block group - * @sb: filesystem super block - * - * Test if the given goal block (group relative) is within the file's - * own block reservation window range. - * - * If the reservation window is outside the goal allocation group, return 0; - * grp_goal (given goal block) could be -1, which means no specific - * goal block. In this case, always return 1. - * If the goal block is within the reservation window, return 1; - * otherwise, return 0; - */ -static int -goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, - unsigned int group, struct super_block * sb) -{ - ext3_fsblk_t group_first_block, group_last_block; - - group_first_block = ext3_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); - - if ((rsv->_rsv_start > group_last_block) || - (rsv->_rsv_end < group_first_block)) - return 0; - if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) - || (grp_goal + group_first_block > rsv->_rsv_end))) - return 0; - return 1; -} - -/** - * search_reserve_window() - * @rb_root: root of reservation tree - * @goal: target allocation block - * - * Find the reserved window which includes the goal, or the previous one - * if the goal is not in any window. - * Returns NULL if there are no windows or if all windows start after the goal. - */ -static struct ext3_reserve_window_node * -search_reserve_window(struct rb_root *root, ext3_fsblk_t goal) -{ - struct rb_node *n = root->rb_node; - struct ext3_reserve_window_node *rsv; - - if (!n) - return NULL; - - do { - rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); - - if (goal < rsv->rsv_start) - n = n->rb_left; - else if (goal > rsv->rsv_end) - n = n->rb_right; - else - return rsv; - } while (n); - /* - * We've fallen off the end of the tree: the goal wasn't inside - * any particular node. OK, the previous node must be to one - * side of the interval containing the goal. If it's the RHS, - * we need to back up one. - */ - if (rsv->rsv_start > goal) { - n = rb_prev(&rsv->rsv_node); - rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node); - } - return rsv; -} - -/** - * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree. - * @sb: super block - * @rsv: reservation window to add - * - * Must be called with rsv_lock hold. - */ -void ext3_rsv_window_add(struct super_block *sb, - struct ext3_reserve_window_node *rsv) -{ - struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; - struct rb_node *node = &rsv->rsv_node; - ext3_fsblk_t start = rsv->rsv_start; - - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; - struct ext3_reserve_window_node *this; - - trace_ext3_rsv_window_add(sb, rsv); - while (*p) - { - parent = *p; - this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node); - - if (start < this->rsv_start) - p = &(*p)->rb_left; - else if (start > this->rsv_end) - p = &(*p)->rb_right; - else { - rsv_window_dump(root, 1); - BUG(); - } - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); -} - -/** - * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree - * @sb: super block - * @rsv: reservation window to remove - * - * Mark the block reservation window as not allocated, and unlink it - * from the filesystem reservation window rb tree. Must be called with - * rsv_lock hold. - */ -static void rsv_window_remove(struct super_block *sb, - struct ext3_reserve_window_node *rsv) -{ - rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_alloc_hit = 0; - rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); -} - -/* - * rsv_is_empty() -- Check if the reservation window is allocated. - * @rsv: given reservation window to check - * - * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED. - */ -static inline int rsv_is_empty(struct ext3_reserve_window *rsv) -{ - /* a valid reservation end block could not be 0 */ - return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED; -} - -/** - * ext3_init_block_alloc_info() - * @inode: file inode structure - * - * Allocate and initialize the reservation window structure, and - * link the window to the ext3 inode structure at last - * - * The reservation window structure is only dynamically allocated - * and linked to ext3 inode the first time the open file - * needs a new block. So, before every ext3_new_block(s) call, for - * regular files, we should check whether the reservation window - * structure exists or not. In the latter case, this function is called. - * Fail to do so will result in block reservation being turned off for that - * open file. - * - * This function is called from ext3_get_blocks_handle(), also called - * when setting the reservation window size through ioctl before the file - * is open for write (needs block allocation). - * - * Needs truncate_mutex protection prior to call this function. - */ -void ext3_init_block_alloc_info(struct inode *inode) -{ - struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i; - struct super_block *sb = inode->i_sb; - - block_i = kmalloc(sizeof(*block_i), GFP_NOFS); - if (block_i) { - struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node; - - rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - - /* - * if filesystem is mounted with NORESERVATION, the goal - * reservation window size is set to zero to indicate - * block reservation is off - */ - if (!test_opt(sb, RESERVATION)) - rsv->rsv_goal_size = 0; - else - rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS; - rsv->rsv_alloc_hit = 0; - block_i->last_alloc_logical_block = 0; - block_i->last_alloc_physical_block = 0; - } - ei->i_block_alloc_info = block_i; -} - -/** - * ext3_discard_reservation() - * @inode: inode - * - * Discard(free) block reservation window on last file close, or truncate - * or at last iput(). - * - * It is being called in three cases: - * ext3_release_file(): last writer close the file - * ext3_clear_inode(): last iput(), when nobody link to this file. - * ext3_truncate(): when the block indirect map is about to change. - * - */ -void ext3_discard_reservation(struct inode *inode) -{ - struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; - struct ext3_reserve_window_node *rsv; - spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; - - if (!block_i) - return; - - rsv = &block_i->rsv_window_node; - if (!rsv_is_empty(&rsv->rsv_window)) { - spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) { - trace_ext3_discard_reservation(inode, rsv); - rsv_window_remove(inode->i_sb, rsv); - } - spin_unlock(rsv_lock); - } -} - -/** - * ext3_free_blocks_sb() -- Free given blocks and update quota - * @handle: handle to this transaction - * @sb: super block - * @block: start physical block to free - * @count: number of blocks to free - * @pdquot_freed_blocks: pointer to quota - */ -void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext3_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; - unsigned long block_group; - ext3_grpblk_t bit; - unsigned long i; - unsigned long overflow; - struct ext3_group_desc * desc; - struct ext3_super_block * es; - struct ext3_sb_info *sbi; - int err = 0, ret; - ext3_grpblk_t group_freed; - - *pdquot_freed_blocks = 0; - sbi = EXT3_SB(sb); - es = sbi->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > le32_to_cpu(es->s_blocks_count)) { - ext3_error (sb, "ext3_free_blocks", - "Freeing blocks not in datazone - " - "block = "E3FSBLK", count = %lu", block, count); - goto error_return; - } - - ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); - -do_more: - overflow = 0; - block_group = (block - le32_to_cpu(es->s_first_data_block)) / - EXT3_BLOCKS_PER_GROUP(sb); - bit = (block - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb); - /* - * Check to see if we are freeing blocks across a group - * boundary. - */ - if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); - count -= overflow; - } - brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - desc = ext3_get_group_desc (sb, block_group, &gd_bh); - if (!desc) - goto error_return; - - if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || - in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || - in_range (block, le32_to_cpu(desc->bg_inode_table), - sbi->s_itb_per_group) || - in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), - sbi->s_itb_per_group)) { - ext3_error (sb, "ext3_free_blocks", - "Freeing blocks in system zones - " - "Block = "E3FSBLK", count = %lu", - block, count); - goto error_return; - } - - /* - * We are about to start releasing blocks in the bitmap, - * so we need undo access. - */ - /* @@@ check errors */ - BUFFER_TRACE(bitmap_bh, "getting undo access"); - err = ext3_journal_get_undo_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; - - jbd_lock_bh_state(bitmap_bh); - - for (i = 0, group_freed = 0; i < count; i++) { - /* - * An HJ special. This is expensive... - */ -#ifdef CONFIG_JBD_DEBUG - jbd_unlock_bh_state(bitmap_bh); - { - struct buffer_head *debug_bh; - debug_bh = sb_find_get_block(sb, block + i); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "Deleted!"); - if (!bh2jh(bitmap_bh)->b_committed_data) - BUFFER_TRACE(debug_bh, - "No committed data in bitmap"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); - __brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); -#endif - if (need_resched()) { - jbd_unlock_bh_state(bitmap_bh); - cond_resched(); - jbd_lock_bh_state(bitmap_bh); - } - /* @@@ This prevents newly-allocated data from being - * freed and then reallocated within the same - * transaction. - * - * Ideally we would want to allow that to happen, but to - * do so requires making journal_forget() capable of - * revoking the queued write of a data block, which - * implies blocking on the journal lock. *forget() - * cannot block due to truncate races. - * - * Eventually we can fix this by making journal_forget() - * return a status indicating whether or not it was able - * to revoke the buffer. On successful revoke, it is - * safe not to set the allocation bit in the committed - * bitmap, because we know that there is no outstanding - * activity on the buffer any more and so it is safe to - * reallocate it. - */ - BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); - J_ASSERT_BH(bitmap_bh, - bh2jh(bitmap_bh)->b_committed_data != NULL); - ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, - bh2jh(bitmap_bh)->b_committed_data); - - /* - * We clear the bit in the bitmap after setting the committed - * data bit, because this is the reverse order to that which - * the allocator uses. - */ - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit + i, bitmap_bh->b_data)) { - jbd_unlock_bh_state(bitmap_bh); - ext3_error(sb, __func__, - "bit already cleared for block "E3FSBLK, - block + i); - jbd_lock_bh_state(bitmap_bh); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - group_freed++; - } - } - jbd_unlock_bh_state(bitmap_bh); - - spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&desc->bg_free_blocks_count, group_freed); - spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_add(&sbi->s_freeblocks_counter, count); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext3_journal_dirty_metadata(handle, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext3_journal_dirty_metadata(handle, gd_bh); - if (!err) err = ret; - *pdquot_freed_blocks += group_freed; - - if (overflow && !err) { - block += count; - count = overflow; - goto do_more; - } - -error_return: - brelse(bitmap_bh); - ext3_std_error(sb, err); - return; -} - -/** - * ext3_free_blocks() -- Free given blocks and update quota - * @handle: handle for this transaction - * @inode: inode - * @block: start physical block to free - * @count: number of blocks to count - */ -void ext3_free_blocks(handle_t *handle, struct inode *inode, - ext3_fsblk_t block, unsigned long count) -{ - struct super_block *sb = inode->i_sb; - unsigned long dquot_freed_blocks; - - trace_ext3_free_blocks(inode, block, count); - ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); - if (dquot_freed_blocks) - dquot_free_block(inode, dquot_freed_blocks); - return; -} - -/** - * ext3_test_allocatable() - * @nr: given allocation block group - * @bh: bufferhead contains the bitmap of the given block group - * - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This - * prevents deletes from freeing up the page for reuse until we have - * committed the delete transaction. - * - * If we didn't do this, then deleting something and reallocating it as - * data would allow the old block to be overwritten before the - * transaction committed (because we force data to disk before commit). - * This would lead to corruption if we crashed between overwriting the - * data and committing the delete. - * - * @@@ We may want to make this allocation behaviour conditional on - * data-writes at some point, and disable it for metadata allocations or - * sync-data inodes. - */ -static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh) -{ - int ret; - struct journal_head *jh = bh2jh(bh); - - if (ext3_test_bit(nr, bh->b_data)) - return 0; - - jbd_lock_bh_state(bh); - if (!jh->b_committed_data) - ret = 1; - else - ret = !ext3_test_bit(nr, jh->b_committed_data); - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * bitmap_search_next_usable_block() - * @start: the starting block (group relative) of the search - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) of the reservation - * - * The bitmap search --- search forward alternately through the actual - * bitmap on disk and the last-committed copy in journal, until we find a - * bit free in both bitmaps. - */ -static ext3_grpblk_t -bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, - ext3_grpblk_t maxblocks) -{ - ext3_grpblk_t next; - struct journal_head *jh = bh2jh(bh); - - while (start < maxblocks) { - next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); - if (next >= maxblocks) - return -1; - if (ext3_test_allocatable(next, bh)) - return next; - jbd_lock_bh_state(bh); - if (jh->b_committed_data) - start = ext3_find_next_zero_bit(jh->b_committed_data, - maxblocks, next); - jbd_unlock_bh_state(bh); - } - return -1; -} - -/** - * find_next_usable_block() - * @start: the starting block (group relative) to find next - * allocatable block in bitmap. - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) for the search - * - * Find an allocatable block in a bitmap. We honor both the bitmap and - * its last-committed copy (if that exists), and perform the "most - * appropriate allocation" algorithm of looking for a free block near - * the initial goal; then for a free byte somewhere in the bitmap; then - * for any free bit in the bitmap. - */ -static ext3_grpblk_t -find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, - ext3_grpblk_t maxblocks) -{ - ext3_grpblk_t here, next; - char *p, *r; - - if (start > 0) { - /* - * The goal was occupied; search forward for a free - * block within the next XX blocks. - * - * end_goal is more or less random, but it has to be - * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the - * next 64-bit boundary is simple.. - */ - ext3_grpblk_t end_goal = (start + 63) & ~63; - if (end_goal > maxblocks) - end_goal = maxblocks; - here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); - if (here < end_goal && ext3_test_allocatable(here, bh)) - return here; - ext3_debug("Bit not found near goal\n"); - } - - here = start; - if (here < 0) - here = 0; - - p = bh->b_data + (here >> 3); - r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); - next = (r - bh->b_data) << 3; - - if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh)) - return next; - - /* - * The bitmap search --- search forward alternately through the actual - * bitmap and the last-committed copy until we find a bit free in - * both - */ - here = bitmap_search_next_usable_block(here, bh, maxblocks); - return here; -} - -/** - * claim_block() - * @lock: the spin lock for this block group - * @block: the free block (group relative) to allocate - * @bh: the buffer_head contains the block group bitmap - * - * We think we can allocate this block in this bitmap. Try to set the bit. - * If that succeeds then check that nobody has allocated and then freed the - * block since we saw that is was not marked in b_committed_data. If it _was_ - * allocated and freed then clear the bit in the bitmap again and return - * zero (failure). - */ -static inline int -claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh) -{ - struct journal_head *jh = bh2jh(bh); - int ret; - - if (ext3_set_bit_atomic(lock, block, bh->b_data)) - return 0; - jbd_lock_bh_state(bh); - if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) { - ext3_clear_bit_atomic(lock, block, bh->b_data); - ret = 0; - } else { - ret = 1; - } - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * ext3_try_to_allocate() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @count: target number of blocks to allocate - * @my_rsv: reservation window - * - * Attempt to allocate blocks within a give range. Set the range of allocation - * first, then find the first free bit(s) from the bitmap (within the range), - * and at last, allocate the blocks by claiming the found free bit as allocated. - * - * To set the range of this allocation: - * if there is a reservation window, only try to allocate block(s) from the - * file's own reservation window; - * Otherwise, the allocation range starts from the give goal block, ends at - * the block group's last block. - * - * If we failed to allocate the desired block then we may end up crossing to a - * new bitmap. In that case we must release write access to the old one via - * ext3_journal_release_buffer(), else we'll run out of credits. - */ -static ext3_grpblk_t -ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, - struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal, - unsigned long *count, struct ext3_reserve_window *my_rsv) -{ - ext3_fsblk_t group_first_block; - ext3_grpblk_t start, end; - unsigned long num = 0; - - /* we do allocation within the reservation window if we have a window */ - if (my_rsv) { - group_first_block = ext3_group_first_block_no(sb, group); - if (my_rsv->_rsv_start >= group_first_block) - start = my_rsv->_rsv_start - group_first_block; - else - /* reservation window cross group boundary */ - start = 0; - end = my_rsv->_rsv_end - group_first_block + 1; - if (end > EXT3_BLOCKS_PER_GROUP(sb)) - /* reservation window crosses group boundary */ - end = EXT3_BLOCKS_PER_GROUP(sb); - if ((start <= grp_goal) && (grp_goal < end)) - start = grp_goal; - else - grp_goal = -1; - } else { - if (grp_goal > 0) - start = grp_goal; - else - start = 0; - end = EXT3_BLOCKS_PER_GROUP(sb); - } - - BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb)); - -repeat: - if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) { - grp_goal = find_next_usable_block(start, bitmap_bh, end); - if (grp_goal < 0) - goto fail_access; - if (!my_rsv) { - int i; - - for (i = 0; i < 7 && grp_goal > start && - ext3_test_allocatable(grp_goal - 1, - bitmap_bh); - i++, grp_goal--) - ; - } - } - start = grp_goal; - - if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), - grp_goal, bitmap_bh)) { - /* - * The block was allocated by another thread, or it was - * allocated and then freed by another thread - */ - start++; - grp_goal++; - if (start >= end) - goto fail_access; - goto repeat; - } - num++; - grp_goal++; - while (num < *count && grp_goal < end - && ext3_test_allocatable(grp_goal, bitmap_bh) - && claim_block(sb_bgl_lock(EXT3_SB(sb), group), - grp_goal, bitmap_bh)) { - num++; - grp_goal++; - } - *count = num; - return grp_goal - num; -fail_access: - *count = num; - return -1; -} - -/** - * find_next_reservable_window(): - * find a reservable space within the given range. - * It does not allocate the reservation window for now: - * alloc_new_reservation() will do the work later. - * - * @search_head: the head of the searching list; - * This is not necessarily the list head of the whole filesystem - * - * We have both head and start_block to assist the search - * for the reservable space. The list starts from head, - * but we will shift to the place where start_block is, - * then start from there, when looking for a reservable space. - * - * @my_rsv: the reservation window - * - * @sb: the super block - * - * @start_block: the first block we consider to start - * the real search from - * - * @last_block: - * the maximum block number that our goal reservable space - * could start from. This is normally the last block in this - * group. The search will end when we found the start of next - * possible reservable space is out of this boundary. - * This could handle the cross boundary reservation window - * request. - * - * basically we search from the given range, rather than the whole - * reservation double linked list, (start_block, last_block) - * to find a free region that is of my size and has not - * been reserved. - * - */ -static int find_next_reservable_window( - struct ext3_reserve_window_node *search_head, - struct ext3_reserve_window_node *my_rsv, - struct super_block * sb, - ext3_fsblk_t start_block, - ext3_fsblk_t last_block) -{ - struct rb_node *next; - struct ext3_reserve_window_node *rsv, *prev; - ext3_fsblk_t cur; - int size = my_rsv->rsv_goal_size; - - /* TODO: make the start of the reservation window byte-aligned */ - /* cur = *start_block & ~7;*/ - cur = start_block; - rsv = search_head; - if (!rsv) - return -1; - - while (1) { - if (cur <= rsv->rsv_end) - cur = rsv->rsv_end + 1; - - /* TODO? - * in the case we could not find a reservable space - * that is what is expected, during the re-search, we could - * remember what's the largest reservable space we could have - * and return that one. - * - * For now it will fail if we could not find the reservable - * space with expected-size (or more)... - */ - if (cur > last_block) - return -1; /* fail */ - - prev = rsv; - next = rb_next(&rsv->rsv_node); - rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node); - - /* - * Reached the last reservation, we can just append to the - * previous one. - */ - if (!next) - break; - - if (cur + size <= rsv->rsv_start) { - /* - * Found a reserveable space big enough. We could - * have a reservation across the group boundary here - */ - break; - } - } - /* - * we come here either : - * when we reach the end of the whole list, - * and there is empty reservable space after last entry in the list. - * append it to the end of the list. - * - * or we found one reservable space in the middle of the list, - * return the reservation window that we could append to. - * succeed. - */ - - if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) - rsv_window_remove(sb, my_rsv); - - /* - * Let's book the whole available window for now. We will check the - * disk bitmap later and then, if there are free blocks then we adjust - * the window size if it's larger than requested. - * Otherwise, we will remove this node from the tree next time - * call find_next_reservable_window. - */ - my_rsv->rsv_start = cur; - my_rsv->rsv_end = cur + size - 1; - my_rsv->rsv_alloc_hit = 0; - - if (prev != my_rsv) - ext3_rsv_window_add(sb, my_rsv); - - return 0; -} - -/** - * alloc_new_reservation()--allocate a new reservation window - * - * To make a new reservation, we search part of the filesystem - * reservation list (the list that inside the group). We try to - * allocate a new reservation window near the allocation goal, - * or the beginning of the group, if there is no goal. - * - * We first find a reservable space after the goal, then from - * there, we check the bitmap for the first free block after - * it. If there is no free block until the end of group, then the - * whole group is full, we failed. Otherwise, check if the free - * block is inside the expected reservable space, if so, we - * succeed. - * If the first free block is outside the reservable space, then - * start from the first free block, we search for next available - * space, and go on. - * - * on succeed, a new reservation will be found and inserted into the list - * It contains at least one free block, and it does not overlap with other - * reservation windows. - * - * failed: we failed to find a reservation window in this group - * - * @my_rsv: the reservation window - * - * @grp_goal: The goal (group-relative). It is where the search for a - * free reservable space should start from. - * if we have a grp_goal(grp_goal >0 ), then start from there, - * no grp_goal(grp_goal = -1), we start from the first block - * of the group. - * - * @sb: the super block - * @group: the group we are trying to allocate in - * @bitmap_bh: the block group block bitmap - * - */ -static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, - ext3_grpblk_t grp_goal, struct super_block *sb, - unsigned int group, struct buffer_head *bitmap_bh) -{ - struct ext3_reserve_window_node *search_head; - ext3_fsblk_t group_first_block, group_end_block, start_block; - ext3_grpblk_t first_free_block; - struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; - unsigned long size; - int ret; - spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; - - group_first_block = ext3_group_first_block_no(sb, group); - group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); - - if (grp_goal < 0) - start_block = group_first_block; - else - start_block = grp_goal + group_first_block; - - trace_ext3_alloc_new_reservation(sb, start_block); - size = my_rsv->rsv_goal_size; - - if (!rsv_is_empty(&my_rsv->rsv_window)) { - /* - * if the old reservation is cross group boundary - * and if the goal is inside the old reservation window, - * we will come here when we just failed to allocate from - * the first part of the window. We still have another part - * that belongs to the next group. In this case, there is no - * point to discard our window and try to allocate a new one - * in this group(which will fail). we should - * keep the reservation window, just simply move on. - * - * Maybe we could shift the start block of the reservation - * window to the first block of next group. - */ - - if ((my_rsv->rsv_start <= group_end_block) && - (my_rsv->rsv_end > group_end_block) && - (start_block >= my_rsv->rsv_start)) - return -1; - - if ((my_rsv->rsv_alloc_hit > - (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { - /* - * if the previously allocation hit ratio is - * greater than 1/2, then we double the size of - * the reservation window the next time, - * otherwise we keep the same size window - */ - size = size * 2; - if (size > EXT3_MAX_RESERVE_BLOCKS) - size = EXT3_MAX_RESERVE_BLOCKS; - my_rsv->rsv_goal_size= size; - } - } - - spin_lock(rsv_lock); - /* - * shift the search start to the window near the goal block - */ - search_head = search_reserve_window(fs_rsv_root, start_block); - - /* - * find_next_reservable_window() simply finds a reservable window - * inside the given range(start_block, group_end_block). - * - * To make sure the reservation window has a free bit inside it, we - * need to check the bitmap after we found a reservable window. - */ -retry: - ret = find_next_reservable_window(search_head, my_rsv, sb, - start_block, group_end_block); - - if (ret == -1) { - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; - } - - /* - * On success, find_next_reservable_window() returns the - * reservation window where there is a reservable space after it. - * Before we reserve this reservable space, we need - * to make sure there is at least a free block inside this region. - * - * searching the first free bit on the block bitmap and copy of - * last committed bitmap alternatively, until we found a allocatable - * block. Search start from the start block of the reservable space - * we just found. - */ - spin_unlock(rsv_lock); - first_free_block = bitmap_search_next_usable_block( - my_rsv->rsv_start - group_first_block, - bitmap_bh, group_end_block - group_first_block + 1); - - if (first_free_block < 0) { - /* - * no free block left on the bitmap, no point - * to reserve the space. return failed. - */ - spin_lock(rsv_lock); - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; /* failed */ - } - - start_block = first_free_block + group_first_block; - /* - * check if the first free block is within the - * free space we just reserved - */ - if (start_block >= my_rsv->rsv_start && - start_block <= my_rsv->rsv_end) { - trace_ext3_reserved(sb, start_block, my_rsv); - return 0; /* success */ - } - /* - * if the first free bit we found is out of the reservable space - * continue search for next reservable space, - * start from where the free block is, - * we also shift the list head to where we stopped last time - */ - search_head = my_rsv; - spin_lock(rsv_lock); - goto retry; -} - -/** - * try_to_extend_reservation() - * @my_rsv: given reservation window - * @sb: super block - * @size: the delta to extend - * - * Attempt to expand the reservation window large enough to have - * required number of free blocks - * - * Since ext3_try_to_allocate() will always allocate blocks within - * the reservation window range, if the window size is too small, - * multiple blocks allocation has to stop at the end of the reservation - * window. To make this more efficient, given the total number of - * blocks needed and the current size of the window, we try to - * expand the reservation window size if necessary on a best-effort - * basis before ext3_new_blocks() tries to allocate blocks, - */ -static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, - struct super_block *sb, int size) -{ - struct ext3_reserve_window_node *next_rsv; - struct rb_node *next; - spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; - - if (!spin_trylock(rsv_lock)) - return; - - next = rb_next(&my_rsv->rsv_node); - - if (!next) - my_rsv->rsv_end += size; - else { - next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node); - - if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) - my_rsv->rsv_end += size; - else - my_rsv->rsv_end = next_rsv->rsv_start - 1; - } - spin_unlock(rsv_lock); -} - -/** - * ext3_try_to_allocate_with_rsv() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @my_rsv: reservation window - * @count: target number of blocks to allocate - * @errp: pointer to store the error code - * - * This is the main function used to allocate a new block and its reservation - * window. - * - * Each time when a new block allocation is need, first try to allocate from - * its own reservation. If it does not have a reservation window, instead of - * looking for a free bit on bitmap first, then look up the reservation list to - * see if it is inside somebody else's reservation window, we try to allocate a - * reservation window for it starting from the goal first. Then do the block - * allocation within the reservation window. - * - * This will avoid keeping on searching the reservation list again and - * again when somebody is looking for a free block (without - * reservation), and there are lots of free blocks, but they are all - * being reserved. - * - * We use a red-black tree for the per-filesystem reservation list. - * - */ -static ext3_grpblk_t -ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, - unsigned int group, struct buffer_head *bitmap_bh, - ext3_grpblk_t grp_goal, - struct ext3_reserve_window_node * my_rsv, - unsigned long *count, int *errp) -{ - ext3_fsblk_t group_first_block, group_last_block; - ext3_grpblk_t ret = 0; - int fatal; - unsigned long num = *count; - - *errp = 0; - - /* - * Make sure we use undo access for the bitmap, because it is critical - * that we do the frozen_data COW on bitmap buffers in all cases even - * if the buffer is in BJ_Forget state in the committing transaction. - */ - BUFFER_TRACE(bitmap_bh, "get undo access for new block"); - fatal = ext3_journal_get_undo_access(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } - - /* - * we don't deal with reservation when - * filesystem is mounted without reservation - * or the file is not a regular file - * or last attempt to allocate a block with reservation turned on failed - */ - if (my_rsv == NULL ) { - ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, count, NULL); - goto out; - } - /* - * grp_goal is a group relative block number (if there is a goal) - * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb) - * first block is a filesystem wide block number - * first block is the block number of the first block in this group - */ - group_first_block = ext3_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); - - /* - * Basically we will allocate a new block from inode's reservation - * window. - * - * We need to allocate a new reservation window, if: - * a) inode does not have a reservation window; or - * b) last attempt to allocate a block from existing reservation - * failed; or - * c) we come here with a goal and with a reservation window - * - * We do not need to allocate a new reservation window if we come here - * at the beginning with a goal and the goal is inside the window, or - * we don't have a goal but already have a reservation window. - * then we could go to allocate from the reservation window directly. - */ - while (1) { - if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || - !goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) { - if (my_rsv->rsv_goal_size < *count) - my_rsv->rsv_goal_size = *count; - ret = alloc_new_reservation(my_rsv, grp_goal, sb, - group, bitmap_bh); - if (ret < 0) - break; /* failed */ - - if (!goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) - grp_goal = -1; - } else if (grp_goal >= 0) { - int curr = my_rsv->rsv_end - - (grp_goal + group_first_block) + 1; - - if (curr < *count) - try_to_extend_reservation(my_rsv, sb, - *count - curr); - } - - if ((my_rsv->rsv_start > group_last_block) || - (my_rsv->rsv_end < group_first_block)) { - rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1); - BUG(); - } - ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, &num, &my_rsv->rsv_window); - if (ret >= 0) { - my_rsv->rsv_alloc_hit += num; - *count = num; - break; /* succeed */ - } - num = *count; - } -out: - if (ret >= 0) { - BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " - "bitmap block"); - fatal = ext3_journal_dirty_metadata(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } - return ret; - } - - BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); - ext3_journal_release_buffer(handle, bitmap_bh); - return ret; -} - -/** - * ext3_has_free_blocks() - * @sbi: in-core super block structure. - * - * Check if filesystem has at least 1 free block available for allocation. - */ -static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) -{ - ext3_fsblk_t free_blocks, root_blocks; - - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) && - (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || - !in_group_p (sbi->s_resgid))) { - return 0; - } - return 1; -} - -/** - * ext3_should_retry_alloc() - * @sb: super block - * @retries number of attemps has been made - * - * ext3_should_retry_alloc() is called when ENOSPC is returned, and if - * it is profitable to retry the operation, this function will wait - * for the current or committing transaction to complete, and then - * return TRUE. - * - * if the total number of retries exceed three times, return FALSE. - */ -int ext3_should_retry_alloc(struct super_block *sb, int *retries) -{ - if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3) - return 0; - - jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - - return journal_force_commit_nested(EXT3_SB(sb)->s_journal); -} - -/** - * ext3_new_blocks() -- core block(s) allocation function - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: target number of blocks to allocate - * @errp: error code - * - * ext3_new_blocks uses a goal block to assist allocation. It tries to - * allocate block(s) from the block group contains the goal block first. If that - * fails, it will try to allocate block(s) from other block groups without - * any specific goal block. - * - */ -ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, unsigned long *count, int *errp) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gdp_bh; - int group_no; - int goal_group; - ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */ - ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ - ext3_fsblk_t ret_block; /* filesyetem-wide allocated block */ - int bgi; /* blockgroup iteration index */ - int fatal = 0, err; - int performed_allocation = 0; - ext3_grpblk_t free_blocks; /* number of free blocks in a group */ - struct super_block *sb; - struct ext3_group_desc *gdp; - struct ext3_super_block *es; - struct ext3_sb_info *sbi; - struct ext3_reserve_window_node *my_rsv = NULL; - struct ext3_block_alloc_info *block_i; - unsigned short windowsz = 0; -#ifdef EXT3FS_DEBUG - static int goal_hits, goal_attempts; -#endif - unsigned long ngroups; - unsigned long num = *count; - - *errp = -ENOSPC; - sb = inode->i_sb; - - /* - * Check quota for allocation of this block. - */ - err = dquot_alloc_block(inode, num); - if (err) { - *errp = err; - return 0; - } - - trace_ext3_request_blocks(inode, goal, num); - - sbi = EXT3_SB(sb); - es = sbi->s_es; - ext3_debug("goal=%lu.\n", goal); - /* - * Allocate a block from reservation only when - * filesystem is mounted with reservation(default,-o reservation), and - * it's a regular file, and - * the desired window size is greater than 0 (One could use ioctl - * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off - * reservation on that particular file) - */ - block_i = EXT3_I(inode)->i_block_alloc_info; - if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) - my_rsv = &block_i->rsv_window_node; - - if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) { - *errp = -ENOSPC; - goto out; - } - - /* - * First, test whether the goal block is free. - */ - if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= le32_to_cpu(es->s_blocks_count)) - goal = le32_to_cpu(es->s_first_data_block); - group_no = (goal - le32_to_cpu(es->s_first_data_block)) / - EXT3_BLOCKS_PER_GROUP(sb); - goal_group = group_no; -retry_alloc: - gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error; - - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - /* - * if there is not enough free blocks to make a new resevation - * turn off reservation for this allocation - */ - if (my_rsv && (free_blocks < windowsz) - && (free_blocks > 0) - && (rsv_is_empty(&my_rsv->rsv_window))) - my_rsv = NULL; - - if (free_blocks > 0) { - grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb)); - bitmap_bh = read_block_bitmap(sb, group_no); - if (!bitmap_bh) - goto io_error; - grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, - group_no, bitmap_bh, grp_target_blk, - my_rsv, &num, &fatal); - if (fatal) - goto out; - if (grp_alloc_blk >= 0) - goto allocated; - } - - ngroups = EXT3_SB(sb)->s_groups_count; - smp_rmb(); - - /* - * Now search the rest of the groups. We assume that - * group_no and gdp correctly point to the last group visited. - */ - for (bgi = 0; bgi < ngroups; bgi++) { - group_no++; - if (group_no >= ngroups) - group_no = 0; - gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error; - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - /* - * skip this group (and avoid loading bitmap) if there - * are no free blocks - */ - if (!free_blocks) - continue; - /* - * skip this group if the number of - * free blocks is less than half of the reservation - * window size. - */ - if (my_rsv && (free_blocks <= (windowsz/2))) - continue; - - brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, group_no); - if (!bitmap_bh) - goto io_error; - /* - * try to allocate block(s) from this group, without a goal(-1). - */ - grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, - group_no, bitmap_bh, -1, my_rsv, - &num, &fatal); - if (fatal) - goto out; - if (grp_alloc_blk >= 0) - goto allocated; - } - /* - * We may end up a bogus earlier ENOSPC error due to - * filesystem is "full" of reservations, but - * there maybe indeed free blocks available on disk - * In this case, we just forget about the reservations - * just do block allocation as without reservations. - */ - if (my_rsv) { - my_rsv = NULL; - windowsz = 0; - group_no = goal_group; - goto retry_alloc; - } - /* No space left on the device */ - *errp = -ENOSPC; - goto out; - -allocated: - - ext3_debug("using block group %d(%d)\n", - group_no, gdp->bg_free_blocks_count); - - BUFFER_TRACE(gdp_bh, "get_write_access"); - fatal = ext3_journal_get_write_access(handle, gdp_bh); - if (fatal) - goto out; - - ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no); - - if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || - in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || - in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), - EXT3_SB(sb)->s_itb_per_group) || - in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), - EXT3_SB(sb)->s_itb_per_group)) { - ext3_error(sb, "ext3_new_block", - "Allocating block in system zone - " - "blocks from "E3FSBLK", length %lu", - ret_block, num); - /* - * claim_block() marked the blocks we allocated as in use. So we - * may want to selectively mark some of the blocks as free. - */ - goto retry_alloc; - } - - performed_allocation = 1; - -#ifdef CONFIG_JBD_DEBUG - { - struct buffer_head *debug_bh; - - /* Record bitmap buffer state in the newly allocated block */ - debug_bh = sb_find_get_block(sb, ret_block); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "state when allocated"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); - brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); - spin_lock(sb_bgl_lock(sbi, group_no)); - if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { - int i; - - for (i = 0; i < num; i++) { - if (ext3_test_bit(grp_alloc_blk+i, - bh2jh(bitmap_bh)->b_committed_data)) { - printk("%s: block was unexpectedly set in " - "b_committed_data\n", __func__); - } - } - } - ext3_debug("found bit %d\n", grp_alloc_blk); - spin_unlock(sb_bgl_lock(sbi, group_no)); - jbd_unlock_bh_state(bitmap_bh); -#endif - - if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { - ext3_error(sb, "ext3_new_block", - "block("E3FSBLK") >= blocks count(%d) - " - "block_group = %d, es == %p ", ret_block, - le32_to_cpu(es->s_blocks_count), group_no, es); - goto out; - } - - /* - * It is up to the caller to add the new buffer to a journal - * list of some description. We don't know in advance whether - * the caller wants to use it as metadata or data. - */ - ext3_debug("allocating block %lu. Goal hits %d of %d.\n", - ret_block, goal_hits, goal_attempts); - - spin_lock(sb_bgl_lock(sbi, group_no)); - le16_add_cpu(&gdp->bg_free_blocks_count, -num); - spin_unlock(sb_bgl_lock(sbi, group_no)); - percpu_counter_sub(&sbi->s_freeblocks_counter, num); - - BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - fatal = ext3_journal_dirty_metadata(handle, gdp_bh); - if (fatal) - goto out; - - *errp = 0; - brelse(bitmap_bh); - - if (num < *count) { - dquot_free_block(inode, *count-num); - *count = num; - } - - trace_ext3_allocate_blocks(inode, goal, num, - (unsigned long long)ret_block); - - return ret_block; - -io_error: - *errp = -EIO; -out: - if (fatal) { - *errp = fatal; - ext3_std_error(sb, fatal); - } - /* - * Undo the block allocation - */ - if (!performed_allocation) - dquot_free_block(inode, *count); - brelse(bitmap_bh); - return 0; -} - -ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int *errp) -{ - unsigned long count = 1; - - return ext3_new_blocks(handle, inode, goal, &count, errp); -} - -/** - * ext3_count_free_blocks() -- count filesystem free blocks - * @sb: superblock - * - * Adds up the number of free blocks from each block group. - */ -ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) -{ - ext3_fsblk_t desc_count; - struct ext3_group_desc *gdp; - int i; - unsigned long ngroups = EXT3_SB(sb)->s_groups_count; -#ifdef EXT3FS_DEBUG - struct ext3_super_block *es; - ext3_fsblk_t bitmap_count; - unsigned long x; - struct buffer_head *bitmap_bh = NULL; - - es = EXT3_SB(sb)->s_es; - desc_count = 0; - bitmap_count = 0; - gdp = NULL; - - smp_rmb(); - for (i = 0; i < ngroups; i++) { - gdp = ext3_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); - brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, i); - if (bitmap_bh == NULL) - continue; - - x = ext3_count_free(bitmap_bh, sb->s_blocksize); - printk("group %d: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_blocks_count), x); - bitmap_count += x; - } - brelse(bitmap_bh); - printk("ext3_count_free_blocks: stored = "E3FSBLK - ", computed = "E3FSBLK", "E3FSBLK"\n", - (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count), - desc_count, bitmap_count); - return bitmap_count; -#else - desc_count = 0; - smp_rmb(); - for (i = 0; i < ngroups; i++) { - gdp = ext3_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); - } - - return desc_count; -#endif -} - -static inline int test_root(int a, int b) -{ - int num = b; - - while (a > num) - num *= b; - return num == a; -} - -static int ext3_group_sparse(int group) -{ - if (group <= 1) - return 1; - if (!(group & 1)) - return 0; - return (test_root(group, 7) || test_root(group, 5) || - test_root(group, 3)); -} - -/** - * ext3_bg_has_super - number of blocks used by the superblock in group - * @sb: superblock for filesystem - * @group: group number to check - * - * Return the number of blocks used by the superblock (primary or backup) - * in this group. Currently this will be only 0 or 1. - */ -int ext3_bg_has_super(struct super_block *sb, int group) -{ - if (EXT3_HAS_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) && - !ext3_group_sparse(group)) - return 0; - return 1; -} - -static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group) -{ - unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); - unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb); - unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1; - - if (group == first || group == first + 1 || group == last) - return 1; - return 0; -} - -static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group) -{ - return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0; -} - -/** - * ext3_bg_num_gdb - number of blocks used by the group table in group - * @sb: superblock for filesystem - * @group: group number to check - * - * Return the number of blocks used by the group descriptor table - * (primary or backup) in this group. In the future there may be a - * different number of descriptor blocks in each group. - */ -unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) -{ - unsigned long first_meta_bg = - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg); - unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb); - - if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) || - metagroup < first_meta_bg) - return ext3_bg_num_gdb_nometa(sb,group); - - return ext3_bg_num_gdb_meta(sb,group); - -} - -/** - * ext3_trim_all_free -- function to trim all free space in alloc. group - * @sb: super block for file system - * @group: allocation group to trim - * @start: first group block to examine - * @max: last group block to examine - * @gdp: allocation group description structure - * @minblocks: minimum extent block count - * - * ext3_trim_all_free walks through group's block bitmap searching for free - * blocks. When the free block is found, it tries to allocate this block and - * consequent free block to get the biggest free extent possible, until it - * reaches any used block. Then issue a TRIM command on this extent and free - * the extent in the block bitmap. This is done until whole group is scanned. - */ -static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, - unsigned int group, - ext3_grpblk_t start, ext3_grpblk_t max, - ext3_grpblk_t minblocks) -{ - handle_t *handle; - ext3_grpblk_t next, free_blocks, bit, freed, count = 0; - ext3_fsblk_t discard_block; - struct ext3_sb_info *sbi; - struct buffer_head *gdp_bh, *bitmap_bh = NULL; - struct ext3_group_desc *gdp; - int err = 0, ret = 0; - - /* - * We will update one block bitmap, and one group descriptor - */ - handle = ext3_journal_start_sb(sb, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - bitmap_bh = read_block_bitmap(sb, group); - if (!bitmap_bh) { - err = -EIO; - goto err_out; - } - - BUFFER_TRACE(bitmap_bh, "getting undo access"); - err = ext3_journal_get_undo_access(handle, bitmap_bh); - if (err) - goto err_out; - - gdp = ext3_get_group_desc(sb, group, &gdp_bh); - if (!gdp) { - err = -EIO; - goto err_out; - } - - BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, gdp_bh); - if (err) - goto err_out; - - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - sbi = EXT3_SB(sb); - - /* Walk through the whole group */ - while (start <= max) { - start = bitmap_search_next_usable_block(start, bitmap_bh, max); - if (start < 0) - break; - next = start; - - /* - * Allocate contiguous free extents by setting bits in the - * block bitmap - */ - while (next <= max - && claim_block(sb_bgl_lock(sbi, group), - next, bitmap_bh)) { - next++; - } - - /* We did not claim any blocks */ - if (next == start) - continue; - - discard_block = (ext3_fsblk_t)start + - ext3_group_first_block_no(sb, group); - - /* Update counters */ - spin_lock(sb_bgl_lock(sbi, group)); - le16_add_cpu(&gdp->bg_free_blocks_count, start - next); - spin_unlock(sb_bgl_lock(sbi, group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); - - free_blocks -= next - start; - /* Do not issue a TRIM on extents smaller than minblocks */ - if ((next - start) < minblocks) - goto free_extent; - - trace_ext3_discard_blocks(sb, discard_block, next - start); - /* Send the TRIM command down to the device */ - err = sb_issue_discard(sb, discard_block, next - start, - GFP_NOFS, 0); - count += (next - start); -free_extent: - freed = 0; - - /* - * Clear bits in the bitmap - */ - for (bit = start; bit < next; bit++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group), - bit, bitmap_bh->b_data)) { - ext3_error(sb, __func__, - "bit already cleared for block "E3FSBLK, - (unsigned long)bit); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - freed++; - } - } - - /* Update couters */ - spin_lock(sb_bgl_lock(sbi, group)); - le16_add_cpu(&gdp->bg_free_blocks_count, freed); - spin_unlock(sb_bgl_lock(sbi, group)); - percpu_counter_add(&sbi->s_freeblocks_counter, freed); - - start = next; - if (err < 0) { - if (err != -EOPNOTSUPP) - ext3_warning(sb, __func__, "Discard command " - "returned error %d\n", err); - break; - } - - if (fatal_signal_pending(current)) { - err = -ERESTARTSYS; - break; - } - - cond_resched(); - - /* No more suitable extents */ - if (free_blocks < minblocks) - break; - } - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - ret = ext3_journal_dirty_metadata(handle, bitmap_bh); - if (!err) - err = ret; - - /* And the group descriptor block */ - BUFFER_TRACE(gdp_bh, "dirtied group descriptor block"); - ret = ext3_journal_dirty_metadata(handle, gdp_bh); - if (!err) - err = ret; - - ext3_debug("trimmed %d blocks in the group %d\n", - count, group); - -err_out: - if (err) - count = err; - ext3_journal_stop(handle); - brelse(bitmap_bh); - - return count; -} - -/** - * ext3_trim_fs() -- trim ioctl handle function - * @sb: superblock for filesystem - * @start: First Byte to trim - * @len: number of Bytes to trim from start - * @minlen: minimum extent length in Bytes - * - * ext3_trim_fs goes through all allocation groups containing Bytes from - * start to start+len. For each such a group ext3_trim_all_free function - * is invoked to trim all free space. - */ -int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) -{ - ext3_grpblk_t last_block, first_block; - unsigned long group, first_group, last_group; - struct ext3_group_desc *gdp; - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - uint64_t start, minlen, end, trimmed = 0; - ext3_fsblk_t first_data_blk = - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); - ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); - int ret = 0; - - start = range->start >> sb->s_blocksize_bits; - end = start + (range->len >> sb->s_blocksize_bits) - 1; - minlen = range->minlen >> sb->s_blocksize_bits; - - if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || - start >= max_blks || - range->len < sb->s_blocksize) - return -EINVAL; - if (end >= max_blks) - end = max_blks - 1; - if (end <= first_data_blk) - goto out; - if (start < first_data_blk) - start = first_data_blk; - - smp_rmb(); - - /* Determine first and last group to examine based on start and len */ - ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start, - &first_group, &first_block); - ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end, - &last_group, &last_block); - - /* end now represents the last block to discard in this group */ - end = EXT3_BLOCKS_PER_GROUP(sb) - 1; - - for (group = first_group; group <= last_group; group++) { - gdp = ext3_get_group_desc(sb, group, NULL); - if (!gdp) - break; - - /* - * For all the groups except the last one, last block will - * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to - * change it for the last group, note that last_block is - * already computed earlier by ext3_get_group_no_and_offset() - */ - if (group == last_group) - end = last_block; - - if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) { - ret = ext3_trim_all_free(sb, group, first_block, - end, minlen); - if (ret < 0) - break; - trimmed += ret; - } - - /* - * For every group except the first one, we are sure - * that the first block to discard will be block #0. - */ - first_block = 0; - } - - if (ret > 0) - ret = 0; - -out: - range->len = trimmed * sb->s_blocksize; - return ret; -} diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c deleted file mode 100644 index ef9c643e8e9d..000000000000 --- a/fs/ext3/bitmap.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * linux/fs/ext3/bitmap.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - */ - -#include "ext3.h" - -#ifdef EXT3FS_DEBUG - -unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) -{ - return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars); -} - -#endif /* EXT3FS_DEBUG */ - diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c deleted file mode 100644 index 17742eed2c16..000000000000 --- a/fs/ext3/dir.c +++ /dev/null @@ -1,537 +0,0 @@ -/* - * linux/fs/ext3/dir.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/dir.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext3 directory handling functions - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * - * Hash Tree Directory indexing (c) 2001 Daniel Phillips - * - */ - -#include <linux/compat.h> -#include "ext3.h" - -static unsigned char ext3_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static int ext3_dx_readdir(struct file *, struct dir_context *); - -static unsigned char get_dtype(struct super_block *sb, int filetype) -{ - if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT3_FT_MAX)) - return DT_UNKNOWN; - - return (ext3_filetype_table[filetype]); -} - -/** - * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which could potentially get converted to use htree - * indexing). - * - * Return 1 if it is a dx dir, 0 if not - */ -static int is_dx_dir(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX) && - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) - return 1; - - return 0; -} - -int ext3_check_dir_entry (const char * function, struct inode * dir, - struct ext3_dir_entry_2 * de, - struct buffer_head * bh, - unsigned long offset) -{ - const char * error_msg = NULL; - const int rlen = ext3_rec_len_from_disk(de->rec_len); - - if (unlikely(rlen < EXT3_DIR_REC_LEN(1))) - error_msg = "rec_len is smaller than minimal"; - else if (unlikely(rlen % 4 != 0)) - error_msg = "rec_len % 4 != 0"; - else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len))) - error_msg = "rec_len is too small for name_len"; - else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))) - error_msg = "directory entry across blocks"; - else if (unlikely(le32_to_cpu(de->inode) > - le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))) - error_msg = "inode out of bounds"; - - if (unlikely(error_msg != NULL)) - ext3_error (dir->i_sb, function, - "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error_msg, offset, - (unsigned long) le32_to_cpu(de->inode), - rlen, de->name_len); - - return error_msg == NULL ? 1 : 0; -} - -static int ext3_readdir(struct file *file, struct dir_context *ctx) -{ - unsigned long offset; - int i; - struct ext3_dir_entry_2 *de; - int err; - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - int dir_has_error = 0; - - if (is_dx_dir(inode)) { - err = ext3_dx_readdir(file, ctx); - if (err != ERR_BAD_DX_DIR) - return err; - /* - * We don't set the inode dirty flag since it's not - * critical that it get flushed back to the disk. - */ - EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; - } - offset = ctx->pos & (sb->s_blocksize - 1); - - while (ctx->pos < inode->i_size) { - unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb); - struct buffer_head map_bh; - struct buffer_head *bh = NULL; - - map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); - if (err > 0) { - pgoff_t index = map_bh.b_blocknr >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&file->f_ra, index)) - page_cache_sync_readahead( - sb->s_bdev->bd_inode->i_mapping, - &file->f_ra, file, - index, 1); - file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext3_bread(NULL, inode, blk, 0, &err); - } - - /* - * We ignore I/O errors on directories so users have a chance - * of recovering data when there's a bad sector - */ - if (!bh) { - if (!dir_has_error) { - ext3_error(sb, __func__, "directory #%lu " - "contains a hole at offset %lld", - inode->i_ino, ctx->pos); - dir_has_error = 1; - } - /* corrupt size? Maybe no more blocks to read */ - if (ctx->pos > inode->i_blocks << 9) - break; - ctx->pos += sb->s_blocksize - offset; - continue; - } - - /* If the dir block has changed since the last call to - * readdir(2), then we might be pointing to an invalid - * dirent right now. Scan from the start of the block - * to make sure. */ - if (offset && file->f_version != inode->i_version) { - for (i = 0; i < sb->s_blocksize && i < offset; ) { - de = (struct ext3_dir_entry_2 *) - (bh->b_data + i); - /* It's too expensive to do a full - * dirent test each time round this - * loop, but we do have to test at - * least that it is non-zero. A - * failure will be detected in the - * dirent test below. */ - if (ext3_rec_len_from_disk(de->rec_len) < - EXT3_DIR_REC_LEN(1)) - break; - i += ext3_rec_len_from_disk(de->rec_len); - } - offset = i; - ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) - | offset; - file->f_version = inode->i_version; - } - - while (ctx->pos < inode->i_size - && offset < sb->s_blocksize) { - de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); - if (!ext3_check_dir_entry ("ext3_readdir", inode, de, - bh, offset)) { - /* On error, skip the to the - next block. */ - ctx->pos = (ctx->pos | - (sb->s_blocksize - 1)) + 1; - break; - } - offset += ext3_rec_len_from_disk(de->rec_len); - if (le32_to_cpu(de->inode)) { - if (!dir_emit(ctx, de->name, de->name_len, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type))) { - brelse(bh); - return 0; - } - } - ctx->pos += ext3_rec_len_from_disk(de->rec_len); - } - offset = 0; - brelse (bh); - if (ctx->pos < inode->i_size) - if (!dir_relax(inode)) - return 0; - } - return 0; -} - -static inline int is_32bit_api(void) -{ -#ifdef CONFIG_COMPAT - return is_compat_task(); -#else - return (BITS_PER_LONG == 32); -#endif -} - -/* - * These functions convert from the major/minor hash to an f_pos - * value for dx directories - * - * Upper layer (for example NFS) should specify FMODE_32BITHASH or - * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted - * directly on both 32-bit and 64-bit nodes, under such case, neither - * FMODE_32BITHASH nor FMODE_64BITHASH is specified. - */ -static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return major >> 1; - else - return ((__u64)(major >> 1) << 32) | (__u64)minor; -} - -static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return (pos << 1) & 0xffffffff; - else - return ((pos >> 32) << 1) & 0xffffffff; -} - -static inline __u32 pos2min_hash(struct file *filp, loff_t pos) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return 0; - else - return pos & 0xffffffff; -} - -/* - * Return 32- or 64-bit end-of-file for dx directories - */ -static inline loff_t ext3_get_htree_eof(struct file *filp) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return EXT3_HTREE_EOF_32BIT; - else - return EXT3_HTREE_EOF_64BIT; -} - - -/* - * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both - * non-htree and htree directories, where the "offset" is in terms - * of the filename hash value instead of the byte offset. - * - * Because we may return a 64-bit hash that is well beyond s_maxbytes, - * we need to pass the max hash as the maximum allowable offset in - * the htree directory case. - * - * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) - * will be invalid once the directory was converted into a dx directory - */ -static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *inode = file->f_mapping->host; - int dx_dir = is_dx_dir(inode); - loff_t htree_max = ext3_get_htree_eof(file); - - if (likely(dx_dir)) - return generic_file_llseek_size(file, offset, whence, - htree_max, htree_max); - else - return generic_file_llseek(file, offset, whence); -} - -/* - * This structure holds the nodes of the red-black tree used to store - * the directory entry in hash order. - */ -struct fname { - __u32 hash; - __u32 minor_hash; - struct rb_node rb_hash; - struct fname *next; - __u32 inode; - __u8 name_len; - __u8 file_type; - char name[0]; -}; - -/* - * This functoin implements a non-recursive way of freeing all of the - * nodes in the red-black tree. - */ -static void free_rb_tree_fname(struct rb_root *root) -{ - struct fname *fname, *next; - - rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) - do { - struct fname *old = fname; - fname = fname->next; - kfree(old); - } while (fname); - - *root = RB_ROOT; -} - -static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, - loff_t pos) -{ - struct dir_private_info *p; - - p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); - if (!p) - return NULL; - p->curr_hash = pos2maj_hash(filp, pos); - p->curr_minor_hash = pos2min_hash(filp, pos); - return p; -} - -void ext3_htree_free_dir_info(struct dir_private_info *p) -{ - free_rb_tree_fname(&p->root); - kfree(p); -} - -/* - * Given a directory entry, enter it into the fname rb tree. - */ -int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext3_dir_entry_2 *dirent) -{ - struct rb_node **p, *parent = NULL; - struct fname * fname, *new_fn; - struct dir_private_info *info; - int len; - - info = (struct dir_private_info *) dir_file->private_data; - p = &info->root.rb_node; - - /* Create and allocate the fname structure */ - len = sizeof(struct fname) + dirent->name_len + 1; - new_fn = kzalloc(len, GFP_KERNEL); - if (!new_fn) - return -ENOMEM; - new_fn->hash = hash; - new_fn->minor_hash = minor_hash; - new_fn->inode = le32_to_cpu(dirent->inode); - new_fn->name_len = dirent->name_len; - new_fn->file_type = dirent->file_type; - memcpy(new_fn->name, dirent->name, dirent->name_len); - new_fn->name[dirent->name_len] = 0; - - while (*p) { - parent = *p; - fname = rb_entry(parent, struct fname, rb_hash); - - /* - * If the hash and minor hash match up, then we put - * them on a linked list. This rarely happens... - */ - if ((new_fn->hash == fname->hash) && - (new_fn->minor_hash == fname->minor_hash)) { - new_fn->next = fname->next; - fname->next = new_fn; - return 0; - } - - if (new_fn->hash < fname->hash) - p = &(*p)->rb_left; - else if (new_fn->hash > fname->hash) - p = &(*p)->rb_right; - else if (new_fn->minor_hash < fname->minor_hash) - p = &(*p)->rb_left; - else /* if (new_fn->minor_hash > fname->minor_hash) */ - p = &(*p)->rb_right; - } - - rb_link_node(&new_fn->rb_hash, parent, p); - rb_insert_color(&new_fn->rb_hash, &info->root); - return 0; -} - - - -/* - * This is a helper function for ext3_dx_readdir. It calls filldir - * for all entres on the fname linked list. (Normally there is only - * one entry on the linked list, unless there are 62 bit hash collisions.) - */ -static bool call_filldir(struct file *file, struct dir_context *ctx, - struct fname *fname) -{ - struct dir_private_info *info = file->private_data; - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - - if (!fname) { - printk("call_filldir: called with null fname?!?\n"); - return true; - } - ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); - while (fname) { - if (!dir_emit(ctx, fname->name, fname->name_len, - fname->inode, - get_dtype(sb, fname->file_type))) { - info->extra_fname = fname; - return false; - } - fname = fname->next; - } - return true; -} - -static int ext3_dx_readdir(struct file *file, struct dir_context *ctx) -{ - struct dir_private_info *info = file->private_data; - struct inode *inode = file_inode(file); - struct fname *fname; - int ret; - - if (!info) { - info = ext3_htree_create_dir_info(file, ctx->pos); - if (!info) - return -ENOMEM; - file->private_data = info; - } - - if (ctx->pos == ext3_get_htree_eof(file)) - return 0; /* EOF */ - - /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != ctx->pos) { - free_rb_tree_fname(&info->root); - info->curr_node = NULL; - info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(file, ctx->pos); - info->curr_minor_hash = pos2min_hash(file, ctx->pos); - } - - /* - * If there are any leftover names on the hash collision - * chain, return them first. - */ - if (info->extra_fname) { - if (!call_filldir(file, ctx, info->extra_fname)) - goto finished; - info->extra_fname = NULL; - goto next_node; - } else if (!info->curr_node) - info->curr_node = rb_first(&info->root); - - while (1) { - /* - * Fill the rbtree if we have no more entries, - * or the inode has changed since we last read in the - * cached entries. - */ - if ((!info->curr_node) || - (file->f_version != inode->i_version)) { - info->curr_node = NULL; - free_rb_tree_fname(&info->root); - file->f_version = inode->i_version; - ret = ext3_htree_fill_tree(file, info->curr_hash, - info->curr_minor_hash, - &info->next_hash); - if (ret < 0) - return ret; - if (ret == 0) { - ctx->pos = ext3_get_htree_eof(file); - break; - } - info->curr_node = rb_first(&info->root); - } - - fname = rb_entry(info->curr_node, struct fname, rb_hash); - info->curr_hash = fname->hash; - info->curr_minor_hash = fname->minor_hash; - if (!call_filldir(file, ctx, fname)) - break; - next_node: - info->curr_node = rb_next(info->curr_node); - if (info->curr_node) { - fname = rb_entry(info->curr_node, struct fname, - rb_hash); - info->curr_hash = fname->hash; - info->curr_minor_hash = fname->minor_hash; - } else { - if (info->next_hash == ~0) { - ctx->pos = ext3_get_htree_eof(file); - break; - } - info->curr_hash = info->next_hash; - info->curr_minor_hash = 0; - } - } -finished: - info->last_pos = ctx->pos; - return 0; -} - -static int ext3_release_dir (struct inode * inode, struct file * filp) -{ - if (filp->private_data) - ext3_htree_free_dir_info(filp->private_data); - - return 0; -} - -const struct file_operations ext3_dir_operations = { - .llseek = ext3_dir_llseek, - .read = generic_read_dir, - .iterate = ext3_readdir, - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .fsync = ext3_sync_file, - .release = ext3_release_dir, -}; diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h deleted file mode 100644 index f483a80b3fe7..000000000000 --- a/fs/ext3/ext3.h +++ /dev/null @@ -1,1332 +0,0 @@ -/* - * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 - * - * Copyright 1998--1999 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/magic.h> -#include <linux/bug.h> -#include <linux/blockgroup_lock.h> - -/* - * The second extended filesystem constants/structures - */ - -/* - * Define EXT3FS_DEBUG to produce debug messages - */ -#undef EXT3FS_DEBUG - -/* - * Define EXT3_RESERVATION to reserve data blocks for expanding files - */ -#define EXT3_DEFAULT_RESERVE_BLOCKS 8 -/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ -#define EXT3_MAX_RESERVE_BLOCKS 1027 -#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0 - -/* - * Debug code - */ -#ifdef EXT3FS_DEBUG -#define ext3_debug(f, a...) \ - do { \ - printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ - __FILE__, __LINE__, __func__); \ - printk (KERN_DEBUG f, ## a); \ - } while (0) -#else -#define ext3_debug(f, a...) do {} while (0) -#endif - -/* - * Special inodes numbers - */ -#define EXT3_BAD_INO 1 /* Bad blocks inode */ -#define EXT3_ROOT_INO 2 /* Root inode */ -#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ -#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ -#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ -#define EXT3_JOURNAL_INO 8 /* Journal inode */ - -/* First non-reserved inode for old ext3 filesystems */ -#define EXT3_GOOD_OLD_FIRST_INO 11 - -/* - * Maximal count of links to a file - */ -#define EXT3_LINK_MAX 32000 - -/* - * Macro-instructions used to manage several block sizes - */ -#define EXT3_MIN_BLOCK_SIZE 1024 -#define EXT3_MAX_BLOCK_SIZE 65536 -#define EXT3_MIN_BLOCK_LOG_SIZE 10 -#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) -#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) -#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits) -#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size) -#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino) - -/* - * Macro-instructions used to manage fragments - */ -#define EXT3_MIN_FRAG_SIZE 1024 -#define EXT3_MAX_FRAG_SIZE 4096 -#define EXT3_MIN_FRAG_LOG_SIZE 10 -#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size) -#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block) - -/* - * Structure of a blocks group descriptor - */ -struct ext3_group_desc -{ - __le32 bg_block_bitmap; /* Blocks bitmap block */ - __le32 bg_inode_bitmap; /* Inodes bitmap block */ - __le32 bg_inode_table; /* Inodes table block */ - __le16 bg_free_blocks_count; /* Free blocks count */ - __le16 bg_free_inodes_count; /* Free inodes count */ - __le16 bg_used_dirs_count; /* Directories count */ - __u16 bg_pad; - __le32 bg_reserved[3]; -}; - -/* - * Macro-instructions used to manage group descriptors - */ -#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group) -#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block) -#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group) -#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits) - -/* - * Constants relative to the data blocks - */ -#define EXT3_NDIR_BLOCKS 12 -#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS -#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) -#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) -#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) - -/* - * Inode flags - */ -#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ -#define EXT3_UNRM_FL 0x00000002 /* Undelete */ -#define EXT3_COMPR_FL 0x00000004 /* Compress file */ -#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ -#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */ -#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ -#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ -#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ -/* Reserved for compression usage... */ -#define EXT3_DIRTY_FL 0x00000100 -#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ -#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ -#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ -/* End compression flags --- maybe not all used */ -#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ -#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ -#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ -#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ -#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ -#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - -#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ - -/* Flags that should be inherited by new inodes from their parent. */ -#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ - EXT3_SYNC_FL | EXT3_NODUMP_FL |\ - EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\ - EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\ - EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL) - -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL)) - -/* Flags that are appropriate for non-directories/regular files. */ -#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL) - -/* Mask out flags that are inappropriate for the given type of inode. */ -static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & EXT3_REG_FLMASK; - else - return flags & EXT3_OTHER_FLMASK; -} - -/* Used to pass group descriptor data when online resize is done */ -struct ext3_new_group_input { - __u32 group; /* Group number for this data */ - __u32 block_bitmap; /* Absolute block number of block bitmap */ - __u32 inode_bitmap; /* Absolute block number of inode bitmap */ - __u32 inode_table; /* Absolute block number of inode table start */ - __u32 blocks_count; /* Total number of blocks in this group */ - __u16 reserved_blocks; /* Number of reserved blocks in this group */ - __u16 unused; -}; - -/* The struct ext3_new_group_input in kernel space, with free_blocks_count */ -struct ext3_new_group_data { - __u32 group; - __u32 block_bitmap; - __u32 inode_bitmap; - __u32 inode_table; - __u32 blocks_count; - __u16 reserved_blocks; - __u16 unused; - __u32 free_blocks_count; -}; - - -/* - * ioctl commands - */ -#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS -#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS -#define EXT3_IOC_GETVERSION _IOR('f', 3, long) -#define EXT3_IOC_SETVERSION _IOW('f', 4, long) -#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) -#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION -#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#ifdef CONFIG_JBD_DEBUG -#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) -#endif -#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) -#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) - -/* - * ioctl commands in 32 bit emulation - */ -#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS -#define EXT3_IOC32_GETVERSION _IOR('f', 3, int) -#define EXT3_IOC32_SETVERSION _IOW('f', 4, int) -#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int) -#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int) -#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) -#ifdef CONFIG_JBD_DEBUG -#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) -#endif -#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION -#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION - -/* Number of supported quota types */ -#define EXT3_MAXQUOTAS 2 - -/* - * Mount options - */ -struct ext3_mount_options { - unsigned long s_mount_opt; - kuid_t s_resuid; - kgid_t s_resgid; - unsigned long s_commit_interval; -#ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[EXT3_MAXQUOTAS]; -#endif -}; - -/* - * Structure of an inode on the disk - */ -struct ext3_inode { - __le16 i_mode; /* File mode */ - __le16 i_uid; /* Low 16 bits of Owner Uid */ - __le32 i_size; /* Size in bytes */ - __le32 i_atime; /* Access time */ - __le32 i_ctime; /* Creation time */ - __le32 i_mtime; /* Modification time */ - __le32 i_dtime; /* Deletion Time */ - __le16 i_gid; /* Low 16 bits of Group Id */ - __le16 i_links_count; /* Links count */ - __le32 i_blocks; /* Blocks count */ - __le32 i_flags; /* File flags */ - union { - struct { - __u32 l_i_reserved1; - } linux1; - struct { - __u32 h_i_translator; - } hurd1; - struct { - __u32 m_i_reserved1; - } masix1; - } osd1; /* OS dependent 1 */ - __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ - __le32 i_generation; /* File version (for NFS) */ - __le32 i_file_acl; /* File ACL */ - __le32 i_dir_acl; /* Directory ACL */ - __le32 i_faddr; /* Fragment address */ - union { - struct { - __u8 l_i_frag; /* Fragment number */ - __u8 l_i_fsize; /* Fragment size */ - __u16 i_pad1; - __le16 l_i_uid_high; /* these 2 fields */ - __le16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; - } linux2; - struct { - __u8 h_i_frag; /* Fragment number */ - __u8 h_i_fsize; /* Fragment size */ - __u16 h_i_mode_high; - __u16 h_i_uid_high; - __u16 h_i_gid_high; - __u32 h_i_author; - } hurd2; - struct { - __u8 m_i_frag; /* Fragment number */ - __u8 m_i_fsize; /* Fragment size */ - __u16 m_pad1; - __u32 m_i_reserved2[2]; - } masix2; - } osd2; /* OS dependent 2 */ - __le16 i_extra_isize; - __le16 i_pad1; -}; - -#define i_size_high i_dir_acl - -#define i_reserved1 osd1.linux1.l_i_reserved1 -#define i_frag osd2.linux2.l_i_frag -#define i_fsize osd2.linux2.l_i_fsize -#define i_uid_low i_uid -#define i_gid_low i_gid -#define i_uid_high osd2.linux2.l_i_uid_high -#define i_gid_high osd2.linux2.l_i_gid_high -#define i_reserved2 osd2.linux2.l_i_reserved2 - -/* - * File system states - */ -#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ -#define EXT3_ERROR_FS 0x0002 /* Errors detected */ -#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ - -/* - * Misc. filesystem flags - */ -#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ -#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ -#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ - -/* - * Mount flags - */ -#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ -/* EXT3_MOUNT_OLDALLOC was there */ -#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */ -#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */ -#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ -#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ -#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ -#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ -#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ -#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */ -#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ -#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ -#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ -#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ -#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ -#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ -#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ -#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */ -#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ -#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write - * error in ordered mode */ - -/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H -#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt -#define set_opt(o, opt) o |= EXT3_MOUNT_##opt -#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \ - EXT3_MOUNT_##opt) -#else -#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD -#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT -#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS -#endif - -#define ext3_set_bit __set_bit_le -#define ext3_set_bit_atomic ext2_set_bit_atomic -#define ext3_clear_bit __clear_bit_le -#define ext3_clear_bit_atomic ext2_clear_bit_atomic -#define ext3_test_bit test_bit_le -#define ext3_find_next_zero_bit find_next_zero_bit_le - -/* - * Maximal mount counts between two filesystem checks - */ -#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ -#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ - -/* - * Behaviour when detecting errors - */ -#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ -#define EXT3_ERRORS_RO 2 /* Remount fs read-only */ -#define EXT3_ERRORS_PANIC 3 /* Panic */ -#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE - -/* - * Structure of the super block - */ -struct ext3_super_block { -/*00*/ __le32 s_inodes_count; /* Inodes count */ - __le32 s_blocks_count; /* Blocks count */ - __le32 s_r_blocks_count; /* Reserved blocks count */ - __le32 s_free_blocks_count; /* Free blocks count */ -/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ - __le32 s_first_data_block; /* First Data Block */ - __le32 s_log_block_size; /* Block size */ - __le32 s_log_frag_size; /* Fragment size */ -/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ - __le32 s_frags_per_group; /* # Fragments per group */ - __le32 s_inodes_per_group; /* # Inodes per group */ - __le32 s_mtime; /* Mount time */ -/*30*/ __le32 s_wtime; /* Write time */ - __le16 s_mnt_count; /* Mount count */ - __le16 s_max_mnt_count; /* Maximal mount count */ - __le16 s_magic; /* Magic signature */ - __le16 s_state; /* File system state */ - __le16 s_errors; /* Behaviour when detecting errors */ - __le16 s_minor_rev_level; /* minor revision level */ -/*40*/ __le32 s_lastcheck; /* time of last check */ - __le32 s_checkinterval; /* max. time between checks */ - __le32 s_creator_os; /* OS */ - __le32 s_rev_level; /* Revision level */ -/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ - __le16 s_def_resgid; /* Default gid for reserved blocks */ - /* - * These fields are for EXT3_DYNAMIC_REV superblocks only. - * - * Note: the difference between the compatible feature set and - * the incompatible feature set is that if there is a bit set - * in the incompatible feature set that the kernel doesn't - * know about, it should refuse to mount the filesystem. - * - * e2fsck's requirements are more strict; if it doesn't know - * about a feature in either the compatible or incompatible - * feature set, it must abort and not try to meddle with - * things it doesn't understand... - */ - __le32 s_first_ino; /* First non-reserved inode */ - __le16 s_inode_size; /* size of inode structure */ - __le16 s_block_group_nr; /* block group # of this superblock */ - __le32 s_feature_compat; /* compatible feature set */ -/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ - __le32 s_feature_ro_compat; /* readonly-compatible feature set */ -/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*78*/ char s_volume_name[16]; /* volume name */ -/*88*/ char s_last_mounted[64]; /* directory where last mounted */ -/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ - /* - * Performance hints. Directory preallocation should only - * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. - */ - __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ - __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ - __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ - /* - * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. - */ -/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ -/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ - __le32 s_journal_dev; /* device number of journal file */ - __le32 s_last_orphan; /* start of list of inodes to delete */ - __le32 s_hash_seed[4]; /* HTREE hash seed */ - __u8 s_def_hash_version; /* Default hash version to use */ - __u8 s_reserved_char_pad; - __u16 s_reserved_word_pad; - __le32 s_default_mount_opts; - __le32 s_first_meta_bg; /* First metablock block group */ - __le32 s_mkfs_time; /* When the filesystem was created */ - __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ - /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ -/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ - __le32 s_r_blocks_count_hi; /* Reserved blocks count */ - __le32 s_free_blocks_count_hi; /* Free blocks count */ - __le16 s_min_extra_isize; /* All inodes have at least # bytes */ - __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ - __le32 s_flags; /* Miscellaneous flags */ - __le16 s_raid_stride; /* RAID stride */ - __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ - __le64 s_mmp_block; /* Block for multi-mount protection */ - __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad2; - __le16 s_reserved_pad; - __u32 s_reserved[162]; /* Padding to the end of the block */ -}; - -/* data type for block offset of block group */ -typedef int ext3_grpblk_t; - -/* data type for filesystem-wide blocks number */ -typedef unsigned long ext3_fsblk_t; - -#define E3FSBLK "%lu" - -struct ext3_reserve_window { - ext3_fsblk_t _rsv_start; /* First byte reserved */ - ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */ -}; - -struct ext3_reserve_window_node { - struct rb_node rsv_node; - __u32 rsv_goal_size; - __u32 rsv_alloc_hit; - struct ext3_reserve_window rsv_window; -}; - -struct ext3_block_alloc_info { - /* information about reservation window */ - struct ext3_reserve_window_node rsv_window_node; - /* - * was i_next_alloc_block in ext3_inode_info - * is the logical (file-relative) number of the - * most-recently-allocated block in this file. - * We use this for detecting linearly ascending allocation requests. - */ - __u32 last_alloc_logical_block; - /* - * Was i_next_alloc_goal in ext3_inode_info - * is the *physical* companion to i_next_alloc_block. - * it the physical block number of the block which was most-recentl - * allocated to this file. This give us the goal (target) for the next - * allocation when we detect linearly ascending requests. - */ - ext3_fsblk_t last_alloc_physical_block; -}; - -#define rsv_start rsv_window._rsv_start -#define rsv_end rsv_window._rsv_end - -/* - * third extended file system inode data in memory - */ -struct ext3_inode_info { - __le32 i_data[15]; /* unconverted */ - __u32 i_flags; -#ifdef EXT3_FRAGMENTS - __u32 i_faddr; - __u8 i_frag_no; - __u8 i_frag_size; -#endif - ext3_fsblk_t i_file_acl; - __u32 i_dir_acl; - __u32 i_dtime; - - /* - * i_block_group is the number of the block group which contains - * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to - * place a file's data blocks near its inode block, and new inodes - * near to their parent directory's inode. - */ - __u32 i_block_group; - unsigned long i_state_flags; /* Dynamic state flags for ext3 */ - - /* block reservation info */ - struct ext3_block_alloc_info *i_block_alloc_info; - - __u32 i_dir_start_lookup; -#ifdef CONFIG_EXT3_FS_XATTR - /* - * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention - * between readers of EAs and writers of regular file data, so - * instead we synchronize on xattr_sem when reading or changing - * EAs. - */ - struct rw_semaphore xattr_sem; -#endif - - struct list_head i_orphan; /* unlinked but open inodes */ - - /* - * i_disksize keeps track of what the inode size is ON DISK, not - * in memory. During truncate, i_size is set to the new size by - * the VFS prior to calling ext3_truncate(), but the filesystem won't - * set i_disksize to 0 until the truncate is actually under way. - * - * The intent is that i_disksize always represents the blocks which - * are used by this file. This allows recovery to restart truncate - * on orphans if we crash during truncate. We actually write i_disksize - * into the on-disk inode when writing inodes out, instead of i_size. - * - * The only time when i_disksize and i_size may be different is when - * a truncate is in progress. The only things which change i_disksize - * are ext3_get_block (growth) and ext3_truncate (shrinkth). - */ - loff_t i_disksize; - - /* on-disk additional length */ - __u16 i_extra_isize; - - /* - * truncate_mutex is for serialising ext3_truncate() against - * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's - * data tree are chopped off during truncate. We can't do that in - * ext3 because whenever we perform intermediate commits during - * truncate, the inode and all the metadata blocks *must* be in a - * consistent state which allows truncation of the orphans to restart - * during recovery. Hence we must fix the get_block-vs-truncate race - * by other means, so we have truncate_mutex. - */ - struct mutex truncate_mutex; - - /* - * Transactions that contain inode's metadata needed to complete - * fsync and fdatasync, respectively. - */ - atomic_t i_sync_tid; - atomic_t i_datasync_tid; - -#ifdef CONFIG_QUOTA - struct dquot *i_dquot[MAXQUOTAS]; -#endif - - struct inode vfs_inode; -}; - -/* - * third extended-fs super-block data in memory - */ -struct ext3_sb_info { - unsigned long s_frag_size; /* Size of a fragment in bytes */ - unsigned long s_frags_per_block;/* Number of fragments per block */ - unsigned long s_inodes_per_block;/* Number of inodes per block */ - unsigned long s_frags_per_group;/* Number of fragments in a group */ - unsigned long s_blocks_per_group;/* Number of blocks in a group */ - unsigned long s_inodes_per_group;/* Number of inodes in a group */ - unsigned long s_itb_per_group; /* Number of inode table blocks per group */ - unsigned long s_gdb_count; /* Number of group descriptor blocks */ - unsigned long s_desc_per_block; /* Number of group descriptors per block */ - unsigned long s_groups_count; /* Number of groups in the fs */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ - struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */ - struct buffer_head ** s_group_desc; - unsigned long s_mount_opt; - ext3_fsblk_t s_sb_block; - kuid_t s_resuid; - kgid_t s_resgid; - unsigned short s_mount_state; - unsigned short s_pad; - int s_addr_per_block_bits; - int s_desc_per_block_bits; - int s_inode_size; - int s_first_ino; - spinlock_t s_next_gen_lock; - u32 s_next_generation; - u32 s_hash_seed[4]; - int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; - struct blockgroup_lock *s_blockgroup_lock; - - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; - struct ext3_reserve_window_node s_rsv_window_head; - - /* Journaling */ - struct inode * s_journal_inode; - struct journal_s * s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; - struct mutex s_resize_lock; - unsigned long s_commit_interval; - struct block_device *journal_bdev; -#ifdef CONFIG_QUOTA - char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ -#endif -}; - -static inline spinlock_t * -sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} - -static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} -static inline struct ext3_inode_info *EXT3_I(struct inode *inode) -{ - return container_of(inode, struct ext3_inode_info, vfs_inode); -} - -static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino) -{ - return ino == EXT3_ROOT_INO || - ino == EXT3_JOURNAL_INO || - ino == EXT3_RESIZE_INO || - (ino >= EXT3_FIRST_INO(sb) && - ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)); -} - -/* - * Inode dynamic state flags - */ -enum { - EXT3_STATE_JDATA, /* journaled data exists */ - EXT3_STATE_NEW, /* inode is newly created */ - EXT3_STATE_XATTR, /* has in-inode xattrs */ - EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */ -}; - -static inline int ext3_test_inode_state(struct inode *inode, int bit) -{ - return test_bit(bit, &EXT3_I(inode)->i_state_flags); -} - -static inline void ext3_set_inode_state(struct inode *inode, int bit) -{ - set_bit(bit, &EXT3_I(inode)->i_state_flags); -} - -static inline void ext3_clear_inode_state(struct inode *inode, int bit) -{ - clear_bit(bit, &EXT3_I(inode)->i_state_flags); -} - -#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime - -/* - * Codes for operating systems - */ -#define EXT3_OS_LINUX 0 -#define EXT3_OS_HURD 1 -#define EXT3_OS_MASIX 2 -#define EXT3_OS_FREEBSD 3 -#define EXT3_OS_LITES 4 - -/* - * Revision levels - */ -#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ -#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ - -#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV -#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV - -#define EXT3_GOOD_OLD_INODE_SIZE 128 - -/* - * Feature set definitions - */ - -#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ - ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) -#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) -#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ - ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) -#define EXT3_SET_COMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) -#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) -#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) -#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) -#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) -#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ - EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) - -#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 -#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 -#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 -#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 -#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 -#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 - -#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 -#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 -#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 - -#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 -#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 -#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ -#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ -#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 - -#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR -#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER| \ - EXT3_FEATURE_INCOMPAT_META_BG) -#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT3_FEATURE_RO_COMPAT_BTREE_DIR) - -/* - * Default values for user and/or group using reserved blocks - */ -#define EXT3_DEF_RESUID 0 -#define EXT3_DEF_RESGID 0 - -/* - * Default mount options - */ -#define EXT3_DEFM_DEBUG 0x0001 -#define EXT3_DEFM_BSDGROUPS 0x0002 -#define EXT3_DEFM_XATTR_USER 0x0004 -#define EXT3_DEFM_ACL 0x0008 -#define EXT3_DEFM_UID16 0x0010 -#define EXT3_DEFM_JMODE 0x0060 -#define EXT3_DEFM_JMODE_DATA 0x0020 -#define EXT3_DEFM_JMODE_ORDERED 0x0040 -#define EXT3_DEFM_JMODE_WBACK 0x0060 - -/* - * Structure of a directory entry - */ -#define EXT3_NAME_LEN 255 - -struct ext3_dir_entry { - __le32 inode; /* Inode number */ - __le16 rec_len; /* Directory entry length */ - __le16 name_len; /* Name length */ - char name[EXT3_NAME_LEN]; /* File name */ -}; - -/* - * The new version of the directory entry. Since EXT3 structures are - * stored in intel byte order, and the name_len field could never be - * bigger than 255 chars, it's safe to reclaim the extra byte for the - * file_type field. - */ -struct ext3_dir_entry_2 { - __le32 inode; /* Inode number */ - __le16 rec_len; /* Directory entry length */ - __u8 name_len; /* Name length */ - __u8 file_type; - char name[EXT3_NAME_LEN]; /* File name */ -}; - -/* - * Ext3 directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. - */ -#define EXT3_FT_UNKNOWN 0 -#define EXT3_FT_REG_FILE 1 -#define EXT3_FT_DIR 2 -#define EXT3_FT_CHRDEV 3 -#define EXT3_FT_BLKDEV 4 -#define EXT3_FT_FIFO 5 -#define EXT3_FT_SOCK 6 -#define EXT3_FT_SYMLINK 7 - -#define EXT3_FT_MAX 8 - -/* - * EXT3_DIR_PAD defines the directory entries boundaries - * - * NOTE: It must be a multiple of 4 - */ -#define EXT3_DIR_PAD 4 -#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) -#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ - ~EXT3_DIR_ROUND) -#define EXT3_MAX_REC_LEN ((1<<16)-1) - -/* - * Tests against MAX_REC_LEN etc were put in place for 64k block - * sizes; if that is not possible on this arch, we can skip - * those tests and speed things up. - */ -static inline unsigned ext3_rec_len_from_disk(__le16 dlen) -{ - unsigned len = le16_to_cpu(dlen); - -#if (PAGE_CACHE_SIZE >= 65536) - if (len == EXT3_MAX_REC_LEN) - return 1 << 16; -#endif - return len; -} - -static inline __le16 ext3_rec_len_to_disk(unsigned len) -{ -#if (PAGE_CACHE_SIZE >= 65536) - if (len == (1 << 16)) - return cpu_to_le16(EXT3_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); -#endif - return cpu_to_le16(len); -} - -/* - * Hash Tree Directory indexing - * (c) Daniel Phillips, 2001 - */ - -#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ - EXT3_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) -#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) -#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) - -/* Legal values for the dx_root hash_version field: */ - -#define DX_HASH_LEGACY 0 -#define DX_HASH_HALF_MD4 1 -#define DX_HASH_TEA 2 -#define DX_HASH_LEGACY_UNSIGNED 3 -#define DX_HASH_HALF_MD4_UNSIGNED 4 -#define DX_HASH_TEA_UNSIGNED 5 - -/* hash info structure used by the directory hash */ -struct dx_hash_info -{ - u32 hash; - u32 minor_hash; - int hash_version; - u32 *seed; -}; - - -/* 32 and 64 bit signed EOF for dx directories */ -#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) -#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) - - -/* - * Control parameters used by ext3_htree_next_block - */ -#define HASH_NB_ALWAYS 1 - - -/* - * Describe an inode's exact location on disk and in memory - */ -struct ext3_iloc -{ - struct buffer_head *bh; - unsigned long offset; - unsigned long block_group; -}; - -static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc) -{ - return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); -} - -/* - * This structure is stuffed into the struct file's private_data field - * for directories. It is where we put information so that we can do - * readdir operations in hash tree order. - */ -struct dir_private_info { - struct rb_root root; - struct rb_node *curr_node; - struct fname *extra_fname; - loff_t last_pos; - __u32 curr_hash; - __u32 curr_minor_hash; - __u32 next_hash; -}; - -/* calculate the first block number of the group */ -static inline ext3_fsblk_t -ext3_group_first_block_no(struct super_block *sb, unsigned long group_no) -{ - return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) + - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); -} - -/* - * Special error return code only used by dx_probe() and its callers. - */ -#define ERR_BAD_DX_DIR -75000 - -/* - * Function prototypes - */ - -/* - * Ok, these declarations are also in <linux/kernel.h> but none of the - * ext3 source programs needs to include it so they are duplicated here. - */ -# define NORET_TYPE /**/ -# define ATTRIB_NORET __attribute__((noreturn)) -# define NORET_AND noreturn, - -/* balloc.c */ -extern int ext3_bg_has_super(struct super_block *sb, int group); -extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); -extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int *errp); -extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, unsigned long *count, int *errp); -extern void ext3_free_blocks (handle_t *handle, struct inode *inode, - ext3_fsblk_t block, unsigned long count); -extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, - ext3_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks); -extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *); -extern void ext3_check_blocks_bitmap (struct super_block *); -extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, - unsigned int block_group, - struct buffer_head ** bh); -extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); -extern void ext3_init_block_alloc_info(struct inode *); -extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv); -extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range); - -/* dir.c */ -extern int ext3_check_dir_entry(const char *, struct inode *, - struct ext3_dir_entry_2 *, - struct buffer_head *, unsigned long); -extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext3_dir_entry_2 *dirent); -extern void ext3_htree_free_dir_info(struct dir_private_info *p); - -/* fsync.c */ -extern int ext3_sync_file(struct file *, loff_t, loff_t, int); - -/* hash.c */ -extern int ext3fs_dirhash(const char *name, int len, struct - dx_hash_info *hinfo); - -/* ialloc.c */ -extern struct inode * ext3_new_inode (handle_t *, struct inode *, - const struct qstr *, umode_t); -extern void ext3_free_inode (handle_t *, struct inode *); -extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); -extern unsigned long ext3_count_free_inodes (struct super_block *); -extern unsigned long ext3_count_dirs (struct super_block *); -extern void ext3_check_inodes_bitmap (struct super_block *); -extern unsigned long ext3_count_free (struct buffer_head *, unsigned); - - -/* inode.c */ -int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext3_fsblk_t blocknr); -struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); -struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, - sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create); - -extern struct inode *ext3_iget(struct super_block *, unsigned long); -extern int ext3_write_inode (struct inode *, struct writeback_control *); -extern int ext3_setattr (struct dentry *, struct iattr *); -extern void ext3_evict_inode (struct inode *); -extern int ext3_sync_inode (handle_t *, struct inode *); -extern void ext3_discard_reservation (struct inode *); -extern void ext3_dirty_inode(struct inode *, int); -extern int ext3_change_inode_journal_flag(struct inode *, int); -extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); -extern int ext3_can_truncate(struct inode *inode); -extern void ext3_truncate(struct inode *inode); -extern void ext3_set_inode_flags(struct inode *); -extern void ext3_get_inode_flags(struct ext3_inode_info *); -extern void ext3_set_aops(struct inode *inode); -extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len); - -/* ioctl.c */ -extern long ext3_ioctl(struct file *, unsigned int, unsigned long); -extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); - -/* namei.c */ -extern int ext3_orphan_add(handle_t *, struct inode *); -extern int ext3_orphan_del(handle_t *, struct inode *); -extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, - __u32 start_minor_hash, __u32 *next_hash); - -/* resize.c */ -extern int ext3_group_add(struct super_block *sb, - struct ext3_new_group_data *input); -extern int ext3_group_extend(struct super_block *sb, - struct ext3_super_block *es, - ext3_fsblk_t n_blocks_count); - -/* super.c */ -extern __printf(3, 4) -void ext3_error(struct super_block *, const char *, const char *, ...); -extern void __ext3_std_error (struct super_block *, const char *, int); -extern __printf(3, 4) -void ext3_abort(struct super_block *, const char *, const char *, ...); -extern __printf(3, 4) -void ext3_warning(struct super_block *, const char *, const char *, ...); -extern __printf(3, 4) -void ext3_msg(struct super_block *, const char *, const char *, ...); -extern void ext3_update_dynamic_rev (struct super_block *sb); - -#define ext3_std_error(sb, errno) \ -do { \ - if ((errno)) \ - __ext3_std_error((sb), __func__, (errno)); \ -} while (0) - -/* - * Inodes and files operations - */ - -/* dir.c */ -extern const struct file_operations ext3_dir_operations; - -/* file.c */ -extern const struct inode_operations ext3_file_inode_operations; -extern const struct file_operations ext3_file_operations; - -/* namei.c */ -extern const struct inode_operations ext3_dir_inode_operations; -extern const struct inode_operations ext3_special_inode_operations; - -/* symlink.c */ -extern const struct inode_operations ext3_symlink_inode_operations; -extern const struct inode_operations ext3_fast_symlink_inode_operations; - -#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal) - -/* Define the number of blocks we need to account to a transaction to - * modify one block of data. - * - * We may have to touch one inode, one bitmap buffer, up to three - * indirection blocks, the group and superblock summaries, and the data - * block to complete the transaction. */ - -#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U - -/* Extended attribute operations touch at most two data buffers, - * two bitmap buffers, and two group summaries, in addition to the inode - * and the superblock, which are already accounted for. */ - -#define EXT3_XATTR_TRANS_BLOCKS 6U - -/* Define the minimum size for a transaction which modifies data. This - * needs to take into account the fact that we may end up modifying two - * quota files too (one for the group, one for the user quota). The - * superblock only gets updated once, of course, so don't bother - * counting that again for the quota updates. */ - -#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ - EXT3_XATTR_TRANS_BLOCKS - 2 + \ - EXT3_MAXQUOTAS_TRANS_BLOCKS(sb)) - -/* Delete operations potentially hit one directory's namespace plus an - * entire inode, plus arbitrary amounts of bitmap/indirection data. Be - * generous. We can grow the delete transaction later if necessary. */ - -#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64) - -/* Define an arbitrary limit for the amount of data we will anticipate - * writing to any given transaction. For unbounded transactions such as - * write(2) and truncate(2) we can write more than this, but we always - * start off at the maximum transaction size and grow the transaction - * optimistically as we go. */ - -#define EXT3_MAX_TRANS_DATA 64U - -/* We break up a large truncate or write transaction once the handle's - * buffer credits gets this low, we need either to extend the - * transaction or to start a new one. Reserve enough space here for - * inode, bitmap, superblock, group and indirection updates for at least - * one block, plus two quota updates. Quota allocations are not - * needed. */ - -#define EXT3_RESERVE_TRANS_BLOCKS 12U - -#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 - -#ifdef CONFIG_QUOTA -/* Amount of blocks needed for quota update - we know that the structure was - * allocated so we need to update only inode+data */ -#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) -/* Amount of blocks needed for quota insert/delete - we do some block writes - * but inode, sb and group updates are done only once */ -#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ - (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) -#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ - (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) -#else -#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0 -#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 -#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 -#endif -#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) -#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) -#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) - -int -ext3_mark_iloc_dirty(handle_t *handle, - struct inode *inode, - struct ext3_iloc *iloc); - -/* - * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. - */ - -int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, - struct ext3_iloc *iloc); - -int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode); - -/* - * Wrapper functions with which ext3 calls into JBD. The intent here is - * to allow these to be turned into appropriate stubs so ext3 can control - * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't - * been done yet. - */ - -static inline void ext3_journal_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - journal_release_buffer(handle, bh); -} - -void ext3_journal_abort_handle(const char *caller, const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err); - -int __ext3_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh); - -int __ext3_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh); - -int __ext3_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh); - -int __ext3_journal_revoke(const char *where, handle_t *handle, - unsigned long blocknr, struct buffer_head *bh); - -int __ext3_journal_get_create_access(const char *where, - handle_t *handle, struct buffer_head *bh); - -int __ext3_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh); - -#define ext3_journal_get_undo_access(handle, bh) \ - __ext3_journal_get_undo_access(__func__, (handle), (bh)) -#define ext3_journal_get_write_access(handle, bh) \ - __ext3_journal_get_write_access(__func__, (handle), (bh)) -#define ext3_journal_revoke(handle, blocknr, bh) \ - __ext3_journal_revoke(__func__, (handle), (blocknr), (bh)) -#define ext3_journal_get_create_access(handle, bh) \ - __ext3_journal_get_create_access(__func__, (handle), (bh)) -#define ext3_journal_dirty_metadata(handle, bh) \ - __ext3_journal_dirty_metadata(__func__, (handle), (bh)) -#define ext3_journal_forget(handle, bh) \ - __ext3_journal_forget(__func__, (handle), (bh)) - -int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh); - -handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); -int __ext3_journal_stop(const char *where, handle_t *handle); - -static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) -{ - return ext3_journal_start_sb(inode->i_sb, nblocks); -} - -#define ext3_journal_stop(handle) \ - __ext3_journal_stop(__func__, (handle)) - -static inline handle_t *ext3_journal_current_handle(void) -{ - return journal_current_handle(); -} - -static inline int ext3_journal_extend(handle_t *handle, int nblocks) -{ - return journal_extend(handle, nblocks); -} - -static inline int ext3_journal_restart(handle_t *handle, int nblocks) -{ - return journal_restart(handle, nblocks); -} - -static inline int ext3_journal_blocks_per_page(struct inode *inode) -{ - return journal_blocks_per_page(inode); -} - -static inline int ext3_journal_force_commit(journal_t *journal) -{ - return journal_force_commit(journal); -} - -/* super.c */ -int ext3_force_commit(struct super_block *sb); - -static inline int ext3_should_journal_data(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 1; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) - return 1; - if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) - return 1; - return 0; -} - -static inline int ext3_should_order_data(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 0; - if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) - return 1; - return 0; -} - -static inline int ext3_should_writeback_data(struct inode *inode) -{ - if (!S_ISREG(inode->i_mode)) - return 0; - if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL) - return 0; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) - return 1; - return 0; -} - -#include <trace/events/ext3.h> diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c deleted file mode 100644 index 785a3261a26c..000000000000 --- a/fs/ext3/ext3_jbd.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Interface between ext3 and JBD - */ - -#include "ext3.h" - -int __ext3_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh) -{ - int err = journal_get_undo_access(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh) -{ - int err = journal_get_write_access(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh) -{ - int err = journal_forget(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_revoke(const char *where, handle_t *handle, - unsigned long blocknr, struct buffer_head *bh) -{ - int err = journal_revoke(handle, blocknr, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_get_create_access(const char *where, - handle_t *handle, struct buffer_head *bh) -{ - int err = journal_get_create_access(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} - -int __ext3_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh) -{ - int err = journal_dirty_metadata(handle, bh); - if (err) - ext3_journal_abort_handle(where, __func__, bh, handle,err); - return err; -} diff --git a/fs/ext3/file.c b/fs/ext3/file.c deleted file mode 100644 index 3b8f650de22c..000000000000 --- a/fs/ext3/file.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * linux/fs/ext3/file.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/file.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext3 fs regular file handling primitives - * - * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) - */ - -#include <linux/quotaops.h> -#include "ext3.h" -#include "xattr.h" -#include "acl.h" - -/* - * Called when an inode is released. Note that this is different - * from ext3_file_open: open gets called at every open, but release - * gets called only when /all/ the files are closed. - */ -static int ext3_release_file (struct inode * inode, struct file * filp) -{ - if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { - filemap_flush(inode->i_mapping); - ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - } - /* if we are the last writer on the inode, drop the block reservation */ - if ((filp->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) - { - mutex_lock(&EXT3_I(inode)->truncate_mutex); - ext3_discard_reservation(inode); - mutex_unlock(&EXT3_I(inode)->truncate_mutex); - } - if (is_dx(inode) && filp->private_data) - ext3_htree_free_dir_info(filp->private_data); - - return 0; -} - -const struct file_operations ext3_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .mmap = generic_file_mmap, - .open = dquot_file_open, - .release = ext3_release_file, - .fsync = ext3_sync_file, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, -}; - -const struct inode_operations ext3_file_inode_operations = { - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext3_get_acl, - .set_acl = ext3_set_acl, - .fiemap = ext3_fiemap, -}; - diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c deleted file mode 100644 index 1cb9c7e10c6f..000000000000 --- a/fs/ext3/fsync.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * linux/fs/ext3/fsync.c - * - * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) - * from - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds - * - * ext3fs fsync primitive - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * - * Removed unnecessary code duplication for little endian machines - * and excessive __inline__s. - * Andi Kleen, 1997 - * - * Major simplications and cleanup - we only need to do the metadata, because - * we can depend on generic_block_fdatasync() to sync the data blocks. - */ - -#include <linux/blkdev.h> -#include <linux/writeback.h> -#include "ext3.h" - -/* - * akpm: A new design for ext3_sync_file(). - * - * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). - * There cannot be a transaction open by this task. - * Another task could have dirtied this inode. Its data can be in any - * state in the journalling system. - * - * What we do is just kick off a commit and wait on it. This will snapshot the - * inode to disk. - */ - -int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct inode *inode = file->f_mapping->host; - struct ext3_inode_info *ei = EXT3_I(inode); - journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; - int ret, needs_barrier = 0; - tid_t commit_tid; - - trace_ext3_sync_file_enter(file, datasync); - - if (inode->i_sb->s_flags & MS_RDONLY) { - /* Make sure that we read updated state */ - smp_rmb(); - if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS) - return -EROFS; - return 0; - } - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - goto out; - - J_ASSERT(ext3_journal_current_handle() == NULL); - - /* - * data=writeback,ordered: - * The caller's filemap_fdatawrite()/wait will sync the data. - * Metadata is in the journal, we wait for a proper transaction - * to commit here. - * - * data=journal: - * filemap_fdatawrite won't do anything (the buffers are clean). - * ext3_force_commit will write the file data into the journal and - * will wait on that. - * filemap_fdatawait() will encounter a ton of newly-dirtied pages - * (they were dirtied by commit). But that's OK - the blocks are - * safe in-journal, which is all fsync() needs to ensure. - */ - if (ext3_should_journal_data(inode)) { - ret = ext3_force_commit(inode->i_sb); - goto out; - } - - if (datasync) - commit_tid = atomic_read(&ei->i_datasync_tid); - else - commit_tid = atomic_read(&ei->i_sync_tid); - - if (test_opt(inode->i_sb, BARRIER) && - !journal_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = 1; - log_start_commit(journal, commit_tid); - ret = log_wait_commit(journal, commit_tid); - - /* - * In case we didn't commit a transaction, we have to flush - * disk caches manually so that data really is on persistent - * storage - */ - if (needs_barrier) { - int err; - - err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - if (!ret) - ret = err; - } -out: - trace_ext3_sync_file_exit(inode, ret); - return ret; -} diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c deleted file mode 100644 index ede315cdf126..000000000000 --- a/fs/ext3/hash.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * linux/fs/ext3/hash.c - * - * Copyright (C) 2002 by Theodore Ts'o - * - * This file is released under the GPL v2. - * - * This file may be redistributed under the terms of the GNU Public - * License. - */ - -#include "ext3.h" -#include <linux/cryptohash.h> - -#define DELTA 0x9E3779B9 - -static void TEA_transform(__u32 buf[4], __u32 const in[]) -{ - __u32 sum = 0; - __u32 b0 = buf[0], b1 = buf[1]; - __u32 a = in[0], b = in[1], c = in[2], d = in[3]; - int n = 16; - - do { - sum += DELTA; - b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); - b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); - } while(--n); - - buf[0] += b0; - buf[1] += b1; -} - - -/* The old legacy hash */ -static __u32 dx_hack_hash_unsigned(const char *name, int len) -{ - __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; - const unsigned char *ucp = (const unsigned char *) name; - - while (len--) { - hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); - - if (hash & 0x80000000) - hash -= 0x7fffffff; - hash1 = hash0; - hash0 = hash; - } - return hash0 << 1; -} - -static __u32 dx_hack_hash_signed(const char *name, int len) -{ - __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; - const signed char *scp = (const signed char *) name; - - while (len--) { - hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - - if (hash & 0x80000000) - hash -= 0x7fffffff; - hash1 = hash0; - hash0 = hash; - } - return hash0 << 1; -} - -static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) -{ - __u32 pad, val; - int i; - const signed char *scp = (const signed char *) msg; - - pad = (__u32)len | ((__u32)len << 8); - pad |= pad << 16; - - val = pad; - if (len > num*4) - len = num * 4; - for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; - val = ((int) scp[i]) + (val << 8); - if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } - } - if (--num >= 0) - *buf++ = val; - while (--num >= 0) - *buf++ = pad; -} - -static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) -{ - __u32 pad, val; - int i; - const unsigned char *ucp = (const unsigned char *) msg; - - pad = (__u32)len | ((__u32)len << 8); - pad |= pad << 16; - - val = pad; - if (len > num*4) - len = num * 4; - for (i=0; i < len; i++) { - if ((i % 4) == 0) - val = pad; - val = ((int) ucp[i]) + (val << 8); - if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } - } - if (--num >= 0) - *buf++ = val; - while (--num >= 0) - *buf++ = pad; -} - -/* - * Returns the hash of a filename. If len is 0 and name is NULL, then - * this function can be used to test whether or not a hash version is - * supported. - * - * The seed is an 4 longword (32 bits) "secret" which can be used to - * uniquify a hash. If the seed is all zero's, then some default seed - * may be used. - * - * A particular hash version specifies whether or not the seed is - * represented, and whether or not the returned hash is 32 bits or 64 - * bits. 32 bit hashes will return 0 for the minor hash. - */ -int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) -{ - __u32 hash; - __u32 minor_hash = 0; - const char *p; - int i; - __u32 in[8], buf[4]; - void (*str2hashbuf)(const char *, int, __u32 *, int) = - str2hashbuf_signed; - - /* Initialize the default seed for the hash checksum functions */ - buf[0] = 0x67452301; - buf[1] = 0xefcdab89; - buf[2] = 0x98badcfe; - buf[3] = 0x10325476; - - /* Check to see if the seed is all zero's */ - if (hinfo->seed) { - for (i=0; i < 4; i++) { - if (hinfo->seed[i]) - break; - } - if (i < 4) - memcpy(buf, hinfo->seed, sizeof(buf)); - } - - switch (hinfo->hash_version) { - case DX_HASH_LEGACY_UNSIGNED: - hash = dx_hack_hash_unsigned(name, len); - break; - case DX_HASH_LEGACY: - hash = dx_hack_hash_signed(name, len); - break; - case DX_HASH_HALF_MD4_UNSIGNED: - str2hashbuf = str2hashbuf_unsigned; - case DX_HASH_HALF_MD4: - p = name; - while (len > 0) { - (*str2hashbuf)(p, len, in, 8); - half_md4_transform(buf, in); - len -= 32; - p += 32; - } - minor_hash = buf[2]; - hash = buf[1]; - break; - case DX_HASH_TEA_UNSIGNED: - str2hashbuf = str2hashbuf_unsigned; - case DX_HASH_TEA: - p = name; - while (len > 0) { - (*str2hashbuf)(p, len, in, 4); - TEA_transform(buf, in); - len -= 16; - p += 16; - } - hash = buf[0]; - minor_hash = buf[1]; - break; - default: - hinfo->hash = 0; - return -1; - } - hash = hash & ~1; - if (hash == (EXT3_HTREE_EOF_32BIT << 1)) - hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; - hinfo->hash = hash; - hinfo->minor_hash = minor_hash; - return 0; -} diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c deleted file mode 100644 index 3ad242e5840e..000000000000 --- a/fs/ext3/ialloc.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * linux/fs/ext3/ialloc.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * BSD ufs-inspired inode and directory allocation by - * Stephen Tweedie (sct@redhat.com), 1993 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/quotaops.h> -#include <linux/random.h> - -#include "ext3.h" -#include "xattr.h" -#include "acl.h" - -/* - * ialloc.c contains the inodes allocation and deallocation routines - */ - -/* - * The free inodes are managed by bitmaps. A file system contains several - * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap - * block for inodes, N blocks for the inode table and data blocks. - * - * The file system contains group descriptors which are located after the - * super block. Each descriptor contains the number of the bitmap block and - * the free blocks count in the block. - */ - - -/* - * Read the inode allocation bitmap for a given block_group, reading - * into the specified slot in the superblock's bitmap cache. - * - * Return buffer_head of bitmap on success or NULL. - */ -static struct buffer_head * -read_inode_bitmap(struct super_block * sb, unsigned long block_group) -{ - struct ext3_group_desc *desc; - struct buffer_head *bh = NULL; - - desc = ext3_get_group_desc(sb, block_group, NULL); - if (!desc) - goto error_out; - - bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); - if (!bh) - ext3_error(sb, "read_inode_bitmap", - "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %u", - block_group, le32_to_cpu(desc->bg_inode_bitmap)); -error_out: - return bh; -} - -/* - * NOTE! When we get the inode, we're the only people - * that have access to it, and as such there are no - * race conditions we have to worry about. The inode - * is not on the hash-lists, and it cannot be reached - * through the filesystem because the directory entry - * has been deleted earlier. - * - * HOWEVER: we must make sure that we get no aliases, - * which means that we have to call "clear_inode()" - * _before_ we mark the inode not in use in the inode - * bitmaps. Otherwise a newly created file might use - * the same inode number (not actually the same pointer - * though), and then we'd have two inodes sharing the - * same inode number and space on the harddisk. - */ -void ext3_free_inode (handle_t *handle, struct inode * inode) -{ - struct super_block * sb = inode->i_sb; - int is_directory; - unsigned long ino; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; - unsigned long block_group; - unsigned long bit; - struct ext3_group_desc * gdp; - struct ext3_super_block * es; - struct ext3_sb_info *sbi; - int fatal = 0, err; - - if (atomic_read(&inode->i_count) > 1) { - printk ("ext3_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); - return; - } - if (inode->i_nlink) { - printk ("ext3_free_inode: inode has nlink=%d\n", - inode->i_nlink); - return; - } - if (!sb) { - printk("ext3_free_inode: inode on nonexistent device\n"); - return; - } - sbi = EXT3_SB(sb); - - ino = inode->i_ino; - ext3_debug ("freeing inode %lu\n", ino); - trace_ext3_free_inode(inode); - - is_directory = S_ISDIR(inode->i_mode); - - es = EXT3_SB(sb)->s_es; - if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext3_error (sb, "ext3_free_inode", - "reserved or nonexistent inode %lu", ino); - goto error_return; - } - block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - - BUFFER_TRACE(bitmap_bh, "get_write_access"); - fatal = ext3_journal_get_write_access(handle, bitmap_bh); - if (fatal) - goto error_return; - - /* Ok, now we can actually update the inode bitmaps.. */ - if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit, bitmap_bh->b_data)) - ext3_error (sb, "ext3_free_inode", - "bit already cleared for inode %lu", ino); - else { - gdp = ext3_get_group_desc (sb, block_group, &bh2); - - BUFFER_TRACE(bh2, "get_write_access"); - fatal = ext3_journal_get_write_access(handle, bh2); - if (fatal) goto error_return; - - if (gdp) { - spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_inodes_count, 1); - if (is_directory) - le16_add_cpu(&gdp->bg_used_dirs_count, -1); - spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_inc(&sbi->s_freeinodes_counter); - if (is_directory) - percpu_counter_dec(&sbi->s_dirs_counter); - - } - BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh2); - if (!fatal) fatal = err; - } - BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bitmap_bh); - if (!fatal) - fatal = err; - -error_return: - brelse(bitmap_bh); - ext3_std_error(sb, fatal); -} - -/* - * Orlov's allocator for directories. - * - * We always try to spread first-level directories. - * - * If there are blockgroups with both free inodes and free blocks counts - * not worse than average we return one with smallest directory count. - * Otherwise we simply return a random group. - * - * For the rest rules look so: - * - * It's OK to put directory into a group unless - * it has too many directories already (max_dirs) or - * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks). - * Parent's group is preferred, if it doesn't satisfy these - * conditions we search cyclically through the rest. If none - * of the groups look good we just look for a group with more - * free inodes than average (starting at parent's group). - * - * Debt is incremented each time we allocate a directory and decremented - * when we allocate an inode, within 0--255. - */ - -static int find_group_orlov(struct super_block *sb, struct inode *parent) -{ - int parent_group = EXT3_I(parent)->i_block_group; - struct ext3_sb_info *sbi = EXT3_SB(sb); - int ngroups = sbi->s_groups_count; - int inodes_per_group = EXT3_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; - ext3_fsblk_t freeb, avefreeb; - unsigned int ndirs; - int max_dirs, min_inodes; - ext3_grpblk_t min_blocks; - int group = -1, i; - struct ext3_group_desc *desc; - - freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); - avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - avefreeb = freeb / ngroups; - ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - - if ((parent == d_inode(sb->s_root)) || - (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { - int best_ndir = inodes_per_group; - int best_group = -1; - - group = prandom_u32(); - parent_group = (unsigned)group % ngroups; - for (i = 0; i < ngroups; i++) { - group = (parent_group + i) % ngroups; - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) - continue; - best_group = group; - best_ndir = le16_to_cpu(desc->bg_used_dirs_count); - } - if (best_group >= 0) - return best_group; - goto fallback; - } - - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group / 4; - min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - - for (i = 0; i < ngroups; i++) { - group = (parent_group + i) % ngroups; - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) - continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) - continue; - return group; - } - -fallback: - for (i = 0; i < ngroups; i++) { - group = (parent_group + i) % ngroups; - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) - return group; - } - - if (avefreei) { - /* - * The free-inodes counter is approximate, and for really small - * filesystems the above test can fail to find any blockgroups - */ - avefreei = 0; - goto fallback; - } - - return -1; -} - -static int find_group_other(struct super_block *sb, struct inode *parent) -{ - int parent_group = EXT3_I(parent)->i_block_group; - int ngroups = EXT3_SB(sb)->s_groups_count; - struct ext3_group_desc *desc; - int group, i; - - /* - * Try to place the inode in its parent directory - */ - group = parent_group; - desc = ext3_get_group_desc (sb, group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) - return group; - - /* - * We're going to place this inode in a different blockgroup from its - * parent. We want to cause files in a common directory to all land in - * the same blockgroup. But we want files which are in a different - * directory which shares a blockgroup with our parent to land in a - * different blockgroup. - * - * So add our directory's i_ino into the starting point for the hash. - */ - group = (group + parent->i_ino) % ngroups; - - /* - * Use a quadratic hash to find a group with a free inode and some free - * blocks. - */ - for (i = 1; i < ngroups; i <<= 1) { - group += i; - if (group >= ngroups) - group -= ngroups; - desc = ext3_get_group_desc (sb, group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) - return group; - } - - /* - * That failed: try linear search for a free inode, even if that group - * has no free blocks. - */ - group = parent_group; - for (i = 0; i < ngroups; i++) { - if (++group >= ngroups) - group = 0; - desc = ext3_get_group_desc (sb, group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) - return group; - } - - return -1; -} - -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory's block - * group to find a free inode. - */ -struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, - const struct qstr *qstr, umode_t mode) -{ - struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; - int group; - unsigned long ino = 0; - struct inode * inode; - struct ext3_group_desc * gdp = NULL; - struct ext3_super_block * es; - struct ext3_inode_info *ei; - struct ext3_sb_info *sbi; - int err = 0; - struct inode *ret; - int i; - - /* Cannot create files in a deleted directory */ - if (!dir || !dir->i_nlink) - return ERR_PTR(-EPERM); - - sb = dir->i_sb; - trace_ext3_request_inode(dir, mode); - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - ei = EXT3_I(inode); - - sbi = EXT3_SB(sb); - es = sbi->s_es; - if (S_ISDIR(mode)) - group = find_group_orlov(sb, dir); - else - group = find_group_other(sb, dir); - - err = -ENOSPC; - if (group == -1) - goto out; - - for (i = 0; i < sbi->s_groups_count; i++) { - err = -EIO; - - gdp = ext3_get_group_desc(sb, group, &bh2); - if (!gdp) - goto fail; - - brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, group); - if (!bitmap_bh) - goto fail; - - ino = 0; - -repeat_in_this_group: - ino = ext3_find_next_zero_bit((unsigned long *) - bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino); - if (ino < EXT3_INODES_PER_GROUP(sb)) { - - BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bitmap_bh); - if (err) - goto fail; - - if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, bitmap_bh->b_data)) { - /* we won it */ - BUFFER_TRACE(bitmap_bh, - "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, - bitmap_bh); - if (err) - goto fail; - goto got; - } - /* we lost it */ - journal_release_buffer(handle, bitmap_bh); - - if (++ino < EXT3_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; - } - - /* - * This case is possible in concurrent environment. It is very - * rare. We cannot repeat the find_group_xxx() call because - * that will simply return the same blockgroup, because the - * group descriptor metadata has not yet been updated. - * So we just go onto the next blockgroup. - */ - if (++group == sbi->s_groups_count) - group = 0; - } - err = -ENOSPC; - goto out; - -got: - ino += group * EXT3_INODES_PER_GROUP(sb) + 1; - if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext3_error (sb, "ext3_new_inode", - "reserved inode or inode > inodes count - " - "block_group = %d, inode=%lu", group, ino); - err = -EIO; - goto fail; - } - - BUFFER_TRACE(bh2, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh2); - if (err) goto fail; - spin_lock(sb_bgl_lock(sbi, group)); - le16_add_cpu(&gdp->bg_free_inodes_count, -1); - if (S_ISDIR(mode)) { - le16_add_cpu(&gdp->bg_used_dirs_count, 1); - } - spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh2); - if (err) goto fail; - - percpu_counter_dec(&sbi->s_freeinodes_counter); - if (S_ISDIR(mode)) - percpu_counter_inc(&sbi->s_dirs_counter); - - - if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); - - inode->i_ino = ino; - /* This is the optimal IO size (for stat), not the fs block size */ - inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - - memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - - ei->i_flags = - ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); -#ifdef EXT3_FRAGMENTS - ei->i_faddr = 0; - ei->i_frag_no = 0; - ei->i_frag_size = 0; -#endif - ei->i_file_acl = 0; - ei->i_dir_acl = 0; - ei->i_dtime = 0; - ei->i_block_alloc_info = NULL; - ei->i_block_group = group; - - ext3_set_inode_flags(inode); - if (IS_DIRSYNC(inode)) - handle->h_sync = 1; - if (insert_inode_locked(inode) < 0) { - /* - * Likely a bitmap corruption causing inode to be allocated - * twice. - */ - err = -EIO; - goto fail; - } - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); - - ei->i_state_flags = 0; - ext3_set_inode_state(inode, EXT3_STATE_NEW); - - /* See comment in ext3_iget for explanation */ - if (ino >= EXT3_FIRST_INO(sb) + 1 && - EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) { - ei->i_extra_isize = - sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE; - } else { - ei->i_extra_isize = 0; - } - - ret = inode; - dquot_initialize(inode); - err = dquot_alloc_inode(inode); - if (err) - goto fail_drop; - - err = ext3_init_acl(handle, inode, dir); - if (err) - goto fail_free_drop; - - err = ext3_init_security(handle, inode, dir, qstr); - if (err) - goto fail_free_drop; - - err = ext3_mark_inode_dirty(handle, inode); - if (err) { - ext3_std_error(sb, err); - goto fail_free_drop; - } - - ext3_debug("allocating inode %lu\n", inode->i_ino); - trace_ext3_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext3_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: - brelse(bitmap_bh); - return ret; - -fail_free_drop: - dquot_free_inode(inode); - -fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - brelse(bitmap_bh); - return ERR_PTR(err); -} - -/* Verify that we are loading a valid orphan from disk */ -struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) -{ - unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); - unsigned long block_group; - int bit; - struct buffer_head *bitmap_bh; - struct inode *inode = NULL; - long err = -EIO; - - /* Error cases - e2fsck has already cleaned up for us */ - if (ino > max_ino) { - ext3_warning(sb, __func__, - "bad orphan ino %lu! e2fsck was run?", ino); - goto error; - } - - block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); - if (!bitmap_bh) { - ext3_warning(sb, __func__, - "inode bitmap error for orphan %lu", ino); - goto error; - } - - /* Having the inode bit set should be a 100% indicator that this - * is a valid orphan (no e2fsck run on fs). Orphans also include - * inodes that were being truncated, so we can't check i_nlink==0. - */ - if (!ext3_test_bit(bit, bitmap_bh->b_data)) - goto bad_orphan; - - inode = ext3_iget(sb, ino); - if (IS_ERR(inode)) - goto iget_failed; - - /* - * If the orphans has i_nlinks > 0 then it should be able to be - * truncated, otherwise it won't be removed from the orphan list - * during processing and an infinite loop will result. - */ - if (inode->i_nlink && !ext3_can_truncate(inode)) - goto bad_orphan; - - if (NEXT_ORPHAN(inode) > max_ino) - goto bad_orphan; - brelse(bitmap_bh); - return inode; - -iget_failed: - err = PTR_ERR(inode); - inode = NULL; -bad_orphan: - ext3_warning(sb, __func__, - "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", - bit, (unsigned long long)bitmap_bh->b_blocknr, - ext3_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); - if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", - is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", - NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); - printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); - /* Avoid freeing blocks if we got a bad deleted inode */ - if (inode->i_nlink == 0) - inode->i_blocks = 0; - iput(inode); - } - brelse(bitmap_bh); -error: - return ERR_PTR(err); -} - -unsigned long ext3_count_free_inodes (struct super_block * sb) -{ - unsigned long desc_count; - struct ext3_group_desc *gdp; - int i; -#ifdef EXT3FS_DEBUG - struct ext3_super_block *es; - unsigned long bitmap_count, x; - struct buffer_head *bitmap_bh = NULL; - - es = EXT3_SB(sb)->s_es; - desc_count = 0; - bitmap_count = 0; - gdp = NULL; - for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { - gdp = ext3_get_group_desc (sb, i, NULL); - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); - brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, i); - if (!bitmap_bh) - continue; - - x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8); - printk("group %d: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_inodes_count), x); - bitmap_count += x; - } - brelse(bitmap_bh); - printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n", - le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); - return desc_count; -#else - desc_count = 0; - for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { - gdp = ext3_get_group_desc (sb, i, NULL); - if (!gdp) - continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); - cond_resched(); - } - return desc_count; -#endif -} - -/* Called at mount-time, super-block is locked */ -unsigned long ext3_count_dirs (struct super_block * sb) -{ - unsigned long count = 0; - int i; - - for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { - struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL); - if (!gdp) - continue; - count += le16_to_cpu(gdp->bg_used_dirs_count); - } - return count; -} - diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c deleted file mode 100644 index 6c7e5468a2f8..000000000000 --- a/fs/ext3/inode.c +++ /dev/null @@ -1,3574 +0,0 @@ -/* - * linux/fs/ext3/inode.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/inode.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) - * - * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 - */ - -#include <linux/highuid.h> -#include <linux/quotaops.h> -#include <linux/writeback.h> -#include <linux/mpage.h> -#include <linux/namei.h> -#include <linux/uio.h> -#include "ext3.h" -#include "xattr.h" -#include "acl.h" - -static int ext3_writepage_trans_blocks(struct inode *inode); -static int ext3_block_truncate_page(struct inode *inode, loff_t from); - -/* - * Test whether an inode is a fast symlink. - */ -static int ext3_inode_is_fast_symlink(struct inode *inode) -{ - int ea_blocks = EXT3_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; - - return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); -} - -/* - * The ext3 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. - */ -int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext3_fsblk_t blocknr) -{ - int err; - - might_sleep(); - - trace_ext3_forget(inode, is_metadata, blocknr); - BUFFER_TRACE(bh, "enter"); - - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %lx\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); - - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ - - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext3_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call journal_forget"); - return ext3_journal_forget(handle, bh); - } - return 0; - } - - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call ext3_journal_revoke"); - err = ext3_journal_revoke(handle, blocknr, bh); - if (err) - ext3_abort(inode->i_sb, __func__, - "error %d when attempting revoke", err); - BUFFER_TRACE(bh, "exit"); - return err; -} - -/* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static unsigned long blocks_for_truncate(struct inode *inode) -{ - unsigned long needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); - - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext3 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. */ - if (needed > EXT3_MAX_TRANS_DATA) - needed = EXT3_MAX_TRANS_DATA; - - return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -} - -/* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext3_journal_start(inode, blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext3_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. - */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -{ - if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) - return 0; - if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) - return 0; - return 1; -} - -/* - * Restart the transaction associated with *handle. This does a commit, - * so before we call here everything must be consistently dirtied against - * this transaction. - */ -static int truncate_restart_transaction(handle_t *handle, struct inode *inode) -{ - int ret; - - jbd_debug(2, "restarting handle %p\n", handle); - /* - * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle - * At this moment, get_block can be called only for blocks inside - * i_size since page cache has been already dropped and writes are - * blocked by i_mutex. So we can safely drop the truncate_mutex. - */ - mutex_unlock(&EXT3_I(inode)->truncate_mutex); - ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); - mutex_lock(&EXT3_I(inode)->truncate_mutex); - return ret; -} - -/* - * Called at inode eviction from icache - */ -void ext3_evict_inode (struct inode *inode) -{ - struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *rsv; - handle_t *handle; - int want_delete = 0; - - trace_ext3_evict_inode(inode); - if (!inode->i_nlink && !is_bad_inode(inode)) { - dquot_initialize(inode); - want_delete = 1; - } - - /* - * When journalling data dirty buffers are tracked only in the journal. - * So although mm thinks everything is clean and ready for reaping the - * inode might still have some pages to write in the running - * transaction or waiting to be checkpointed. Thus calling - * journal_invalidatepage() (via truncate_inode_pages()) to discard - * these buffers can cause data loss. Also even if we did not discard - * these buffers, we would have no way to find them after the inode - * is reaped and thus user could see stale data if he tries to read - * them before the transaction is checkpointed. So be careful and - * force everything to disk here... We use ei->i_datasync_tid to - * store the newest transaction containing inode's data. - * - * Note that directories do not have this problem because they don't - * use page cache. - * - * The s_journal check handles the case when ext3_get_journal() fails - * and puts the journal inode. - */ - if (inode->i_nlink && ext3_should_journal_data(inode) && - EXT3_SB(inode->i_sb)->s_journal && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_ino != EXT3_JOURNAL_INO) { - tid_t commit_tid = atomic_read(&ei->i_datasync_tid); - journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; - - log_start_commit(journal, commit_tid); - log_wait_commit(journal, commit_tid); - filemap_write_and_wait(&inode->i_data); - } - truncate_inode_pages_final(&inode->i_data); - - ext3_discard_reservation(inode); - rsv = ei->i_block_alloc_info; - ei->i_block_alloc_info = NULL; - if (unlikely(rsv)) - kfree(rsv); - - if (!want_delete) - goto no_delete; - - handle = start_transaction(inode); - if (IS_ERR(handle)) { - /* - * If we're going to skip the normal cleanup, we still need to - * make sure that the in-core orphan linked list is properly - * cleaned up. - */ - ext3_orphan_del(NULL, inode); - goto no_delete; - } - - if (IS_SYNC(inode)) - handle->h_sync = 1; - inode->i_size = 0; - if (inode->i_blocks) - ext3_truncate(inode); - /* - * Kill off the orphan record created when the inode lost the last - * link. Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - ext3_truncate() could - * have removed the record. - */ - ext3_orphan_del(handle, inode); - ei->i_dtime = get_seconds(); - - /* - * One subtle ordering requirement: if anything has gone wrong - * (transaction abort, IO errors, whatever), then we can still - * do these next steps (the fs will already have been marked as - * having errors), but we can't free the inode if the mark_dirty - * fails. - */ - if (ext3_mark_inode_dirty(handle, inode)) { - /* If that failed, just dquot_drop() and be done with that */ - dquot_drop(inode); - clear_inode(inode); - } else { - ext3_xattr_delete_inode(handle, inode); - dquot_free_inode(inode); - dquot_drop(inode); - clear_inode(inode); - ext3_free_inode(handle, inode); - } - ext3_journal_stop(handle); - return; -no_delete: - clear_inode(inode); - dquot_drop(inode); -} - -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -static int verify_chain(Indirect *from, Indirect *to) -{ - while (from <= to && from->key == *from->p) - from++; - return (from > to); -} - -/** - * ext3_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext3 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. - * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ - -/* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. - */ - -static int ext3_block_to_path(struct inode *inode, - long i_block, int offsets[4], int *boundary) -{ - int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT3_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < 0) { - ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); - } else if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ( (i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT3_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT3_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT3_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big"); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -/** - * ext3_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples <key, p, bh> and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. - * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it notices that chain had been changed while it was reading - * (ditto, *@err == -EAGAIN) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). - */ -static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, - Indirect chain[4], int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_bread(sb, le32_to_cpu(p->key)); - if (!bh) - goto failure; - /* Reader: pointers */ - if (!verify_chain(chain, p)) - goto changed; - add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -changed: - brelse(bh); - *err = -EAGAIN; - goto no_block; -failure: - *err = -EIO; -no_block: - return p; -} - -/** - * ext3_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the preferred place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. - */ -static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) -{ - struct ext3_inode_info *ei = EXT3_I(inode); - __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; - __le32 *p; - ext3_fsblk_t bg_start; - ext3_grpblk_t colour; - - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); - } - - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); - colour = (current->pid % 16) * - (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); - return bg_start + colour; -} - -/** - * ext3_find_goal - find a preferred place for allocation. - * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the preferred place for block allocation, - * returns it. - */ - -static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, - Indirect *partial) -{ - struct ext3_block_alloc_info *block_i; - - block_i = EXT3_I(inode)->i_block_alloc_info; - - /* - * try the heuristic for sequential allocation, - * failing that at least try to get decent locality. - */ - if (block_i && (block == block_i->last_alloc_logical_block + 1) - && (block_i->last_alloc_physical_block != 0)) { - return block_i->last_alloc_physical_block + 1; - } - - return ext3_find_near(inode, partial); -} - -/** - * ext3_blks_to_allocate - Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. - * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, - int blocks_to_boundary) -{ - unsigned long count = 0; - - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; - } - - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; - } - return count; -} - -/** - * ext3_alloc_blocks - multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: owner - * @goal: preferred place for allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of blocks need to allocated for direct blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: here we store the error value - * - * return the number of direct blocks allocated - */ -static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, - ext3_fsblk_t goal, int indirect_blks, int blks, - ext3_fsblk_t new_blocks[4], int *err) -{ - int target, i; - unsigned long count = 0; - int index = 0; - ext3_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - target = blks + indirect_blks; - - while (1) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext3_new_blocks(handle,inode,goal,&count,err); - if (*err) - goto failed_out; - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - - if (count > 0) - break; - } - - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - - /* total number of blocks allocated for direct blocks */ - ret = count; - *err = 0; - return ret; -failed_out: - for (i = 0; i <index; i++) - ext3_free_blocks(handle, inode, new_blocks[i], 1); - return ret; -} - -/** - * ext3_alloc_branch - allocate and set up a chain of blocks. - * @handle: handle for this transaction - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @goal: preferred place for allocation - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext3_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). Upon the exit we have the same - * picture as after the successful ext3_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain - * as described above and return 0. - */ -static int ext3_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext3_fsblk_t goal, - int *offsets, Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext3_fsblk_t new_blocks[4]; - ext3_fsblk_t current_block; - - num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -ENOMEM; - goto failed; - } - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext3_journal_get_create_access(handle, bh); - if (err) { - unlock_buffer(bh); - brelse(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if ( n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i=1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - for (i = 1; i <= n ; i++) { - BUFFER_TRACE(branch[i].bh, "call journal_forget"); - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < indirect_blks; i++) - ext3_free_blocks(handle, inode, new_blocks[i], 1); - - ext3_free_blocks(handle, inode, new_blocks[i], num); - - return err; -} - -/** - * ext3_splice_branch - splice the allocated branch onto inode. - * @handle: handle for this transaction - * @inode: owner - * @block: (logical) number of block we are adding - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. - */ -static int ext3_splice_branch(handle_t *handle, struct inode *inode, - long block, Indirect *where, int num, int blks) -{ - int i; - int err = 0; - struct ext3_block_alloc_info *block_i; - ext3_fsblk_t current_block; - struct ext3_inode_info *ei = EXT3_I(inode); - struct timespec now; - - block_i = ei->i_block_alloc_info; - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. - */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i ) = cpu_to_le32(current_block++); - } - - /* - * update the most recently allocated logical & physical block - * in i_block_alloc_info, to assist find the proper goal block for next - * allocation - */ - if (block_i) { - block_i->last_alloc_logical_block = block + blks - 1; - block_i->last_alloc_physical_block = - le32_to_cpu(where[num].key) + blks - 1; - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - now = CURRENT_TIME_SEC; - if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { - inode->i_ctime = now; - ext3_mark_inode_dirty(handle, inode); - } - /* ext3_mark_inode_dirty already updated i_sync_tid */ - atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); - - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, where->bh); - if (err) - goto err_out; - } else { - /* - * OK, we spliced it into the inode itself on a direct block. - * Inode was dirtied above. - */ - jbd_debug(5, "splicing direct\n"); - } - return err; - -err_out: - for (i = 1; i <= num; i++) { - BUFFER_TRACE(where[i].bh, "call journal_forget"); - ext3_journal_forget(handle, where[i].bh); - ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); - } - ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); - - return err; -} - -/* - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. - * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * The BKL may not be held on entry here. Be sure to take it early. - * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - */ -int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, - sector_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create) -{ - int err = -EIO; - int offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext3_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - struct ext3_inode_info *ei = EXT3_I(inode); - int count = 0; - ext3_fsblk_t first_block = 0; - - - trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); - J_ASSERT(handle != NULL || create == 0); - depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext3_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); - count++; - /*map more blocks*/ - while (count < maxblocks && count <= blocks_to_boundary) { - ext3_fsblk_t blk; - - if (!verify_chain(chain, chain + depth - 1)) { - /* - * Indirect block might be removed by - * truncate while we were reading it. - * Handling of that case: forget what we've - * got now. Flag the err as EAGAIN, so it - * will reread. - */ - err = -EAGAIN; - count = 0; - break; - } - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - if (err != -EAGAIN) - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) - goto cleanup; - - /* - * Block out ext3_truncate while we alter the tree - */ - mutex_lock(&ei->truncate_mutex); - - /* - * If the indirect block is missing while we are reading - * the chain(ext3_get_branch() returns -EAGAIN err), or - * if the chain has been changed after we grab the semaphore, - * (either because another process truncated this branch, or - * another get_block allocated this branch) re-grab the chain to see if - * the request block has been allocated or not. - * - * Since we already block the truncate/other get_block - * at this point, we will have the current copy of the chain when we - * splice the branch into the tree. - */ - if (err == -EAGAIN || !verify_chain(chain, partial)) { - while (partial > chain) { - brelse(partial->bh); - partial--; - } - partial = ext3_get_branch(inode, depth, offsets, chain, &err); - if (!partial) { - count++; - mutex_unlock(&ei->truncate_mutex); - if (err) - goto cleanup; - clear_buffer_new(bh_result); - goto got_it; - } - } - - /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary - */ - if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) - ext3_init_block_alloc_info(inode); - - goal = ext3_find_goal(inode, iblock, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. - */ - count = ext3_blks_to_allocate(partial, indirect_blks, - maxblocks, blocks_to_boundary); - err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); - - /* - * The ext3_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct - */ - if (!err) - err = ext3_splice_branch(handle, inode, iblock, - partial, indirect_blks, count); - mutex_unlock(&ei->truncate_mutex); - if (err) - goto cleanup; - - set_buffer_new(bh_result); -got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); - if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } - BUFFER_TRACE(bh_result, "returned"); -out: - trace_ext3_get_blocks_exit(inode, iblock, - depth ? le32_to_cpu(chain[depth-1].key) : 0, - count, err); - return err; -} - -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 -/* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. - */ -#define DIO_CREDITS 25 - -static int ext3_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - handle_t *handle = ext3_journal_current_handle(); - int ret = 0, started = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - - if (create && !handle) { /* Direct IO write... */ - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - handle = ext3_journal_start(inode, DIO_CREDITS + - EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - started = 1; - } - - ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - if (started) - ext3_journal_stop(handle); -out: - return ret; -} - -int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - return generic_block_fiemap(inode, fieinfo, start, len, - ext3_get_block); -} - -/* - * `handle' can be NULL if create is zero - */ -struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, - long block, int create, int *errp) -{ - struct buffer_head dummy; - int fatal = 0, err; - - J_ASSERT(handle != NULL || create == 0); - - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); - err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create); - /* - * ext3_get_blocks_handle() returns number of blocks - * mapped. 0 in case of a HOLE. - */ - if (err > 0) { - WARN_ON(err > 1); - err = 0; - } - *errp = err; - if (!err && buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (unlikely(!bh)) { - *errp = -ENOMEM; - goto err; - } - if (buffer_new(&dummy)) { - J_ASSERT(create != 0); - J_ASSERT(handle != NULL); - - /* - * Now that we do not always journal data, we should - * keep in mind whether this should always journal the - * new buffer as metadata. For now, regular file - * writes use ext3_get_block instead, so it's not a - * problem. - */ - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext3_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data,0,inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (!fatal) - fatal = err; - } else { - BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; - } - return bh; - } -err: - return NULL; -} - -struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, - int block, int create, int *err) -{ - struct buffer_head * bh; - - bh = ext3_getblk(handle, inode, block, create, err); - if (!bh) - return bh; - if (bh_uptodate_or_lock(bh)) - return bh; - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return bh; - put_bh(bh); - *err = -EIO; - return NULL; -} - -static int walk_page_buffers( handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)( handle_t *handle, - struct buffer_head *bh)) -{ - struct buffer_head *bh; - unsigned block_start, block_end; - unsigned blocksize = head->b_size; - int err, ret = 0; - struct buffer_head *next; - - for ( bh = head, block_start = 0; - ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) - { - next = bh->b_this_page; - block_end = block_start + blocksize; - if (block_end <= from || block_start >= to) { - if (partial && !buffer_uptodate(bh)) - *partial = 1; - continue; - } - err = (*fn)(handle, bh); - if (!ret) - ret = err; - } - return ret; -} - -/* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction. We cannot - * close off a transaction and start a new one between the ext3_get_block() - * and the commit_write(). So doing the journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext3_writepage() -> - * block_write_full_page(). In that case, we *know* that ext3_writepage() - * has generated enough buffer credits to do the whole page. So we won't - * block on the journal in that case, which is good, because the caller may - * be PF_MEMALLOC. - * - * By accident, ext3 can be reentered when a transaction is open via - * quota file writes. If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. - * - * So what we do is to rely on the fact that journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated. We'll still have enough credits for the tiny quotafile - * write. - */ -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) -{ - int dirty = buffer_dirty(bh); - int ret; - - if (!buffer_mapped(bh) || buffer_freed(bh)) - return 0; - /* - * __block_prepare_write() could have dirtied some buffers. Clean - * the dirty bit as jbd2_journal_get_write_access() could complain - * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_prepare_write() isn't a real problem here as we clear - * the bit before releasing a page lock and thus writeback cannot - * ever write the buffer. - */ - if (dirty) - clear_buffer_dirty(bh); - ret = ext3_journal_get_write_access(handle, bh); - if (!ret && dirty) - ret = ext3_journal_dirty_metadata(handle, bh); - return ret; -} - -/* - * Truncate blocks that were not used by write. We have to truncate the - * pagecache as well so that corresponding buffers get properly unmapped. - */ -static void ext3_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - ext3_truncate(inode); -} - -/* - * Truncate blocks that were not used by direct IO write. We have to zero out - * the last file block as well because direct IO might have written to it. - */ -static void ext3_truncate_failed_direct_write(struct inode *inode) -{ - ext3_block_truncate_page(inode, inode->i_size); - ext3_truncate(inode); -} - -static int ext3_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct inode *inode = mapping->host; - int ret; - handle_t *handle; - int retries = 0; - struct page *page; - pgoff_t index; - unsigned from, to; - /* Reserve one block more for addition to orphan list in case - * we allocate blocks but write fails for some reason */ - int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; - - trace_ext3_write_begin(inode, pos, len, flags); - - index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - -retry: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - *pagep = page; - - handle = ext3_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); - ret = PTR_ERR(handle); - goto out; - } - ret = __block_write_begin(page, pos, len, ext3_get_block); - if (ret) - goto write_begin_failed; - - if (ext3_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); - } -write_begin_failed: - if (ret) { - /* - * block_write_begin may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - * - * Add inode to orphan list in case we crash before truncate - * finishes. Do this only if ext3_can_truncate() agrees so - * that orphan processing code is happy. - */ - if (pos + len > inode->i_size && ext3_can_truncate(inode)) - ext3_orphan_add(handle, inode); - ext3_journal_stop(handle); - unlock_page(page); - page_cache_release(page); - if (pos + len > inode->i_size) - ext3_truncate_failed_write(inode); - } - if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: - return ret; -} - - -int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = journal_dirty_data(handle, bh); - if (err) - ext3_journal_abort_handle(__func__, __func__, - bh, handle, err); - return err; -} - -/* For ordered writepage and write_end functions */ -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - /* - * Write could have mapped the buffer but it didn't copy the data in - * yet. So avoid filing such buffer into a transaction. - */ - if (buffer_mapped(bh) && buffer_uptodate(bh)) - return ext3_journal_dirty_data(handle, bh); - return 0; -} - -/* For write_end() in data=journal mode */ -static int write_end_fn(handle_t *handle, struct buffer_head *bh) -{ - if (!buffer_mapped(bh) || buffer_freed(bh)) - return 0; - set_buffer_uptodate(bh); - return ext3_journal_dirty_metadata(handle, bh); -} - -/* - * This is nasty and subtle: ext3_write_begin() could have allocated blocks - * for the whole page but later we failed to copy the data in. Update inode - * size according to what we managed to copy. The rest is going to be - * truncated in write_end function. - */ -static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied) -{ - /* What matters to us is i_disksize. We don't write i_size anywhere */ - if (pos + copied > inode->i_size) - i_size_write(inode, pos + copied); - if (pos + copied > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = pos + copied; - mark_inode_dirty(inode); - } -} - -/* - * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). - * - * ext3 never places buffers on inode->i_mapping->private_list. metadata - * buffers are managed internally. - */ -static int ext3_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = file->f_mapping->host; - unsigned from, to; - int ret = 0, ret2; - - trace_ext3_ordered_write_end(inode, pos, len, copied); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + copied; - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, journal_dirty_data_fn); - - if (ret == 0) - update_file_sizes(inode, pos, copied); - /* - * There may be allocated blocks outside of i_size because - * we failed to copy some data. Prepare for truncate. - */ - if (pos + len > inode->i_size && ext3_can_truncate(inode)) - ext3_orphan_add(handle, inode); - ret2 = ext3_journal_stop(handle); - if (!ret) - ret = ret2; - unlock_page(page); - page_cache_release(page); - - if (pos + len > inode->i_size) - ext3_truncate_failed_write(inode); - return ret ? ret : copied; -} - -static int ext3_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = file->f_mapping->host; - int ret; - - trace_ext3_writeback_write_end(inode, pos, len, copied); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - update_file_sizes(inode, pos, copied); - /* - * There may be allocated blocks outside of i_size because - * we failed to copy some data. Prepare for truncate. - */ - if (pos + len > inode->i_size && ext3_can_truncate(inode)) - ext3_orphan_add(handle, inode); - ret = ext3_journal_stop(handle); - unlock_page(page); - page_cache_release(page); - - if (pos + len > inode->i_size) - ext3_truncate_failed_write(inode); - return ret ? ret : copied; -} - -static int ext3_journalled_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = mapping->host; - struct ext3_inode_info *ei = EXT3_I(inode); - int ret = 0, ret2; - int partial = 0; - unsigned from, to; - - trace_ext3_journalled_write_end(inode, pos, len, copied); - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from + copied, to); - to = from + copied; - } - - ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); - - if (pos + copied > inode->i_size) - i_size_write(inode, pos + copied); - /* - * There may be allocated blocks outside of i_size because - * we failed to copy some data. Prepare for truncate. - */ - if (pos + len > inode->i_size && ext3_can_truncate(inode)) - ext3_orphan_add(handle, inode); - ext3_set_inode_state(inode, EXT3_STATE_JDATA); - atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); - if (inode->i_size > ei->i_disksize) { - ei->i_disksize = inode->i_size; - ret2 = ext3_mark_inode_dirty(handle, inode); - if (!ret) - ret = ret2; - } - - ret2 = ext3_journal_stop(handle); - if (!ret) - ret = ret2; - unlock_page(page); - page_cache_release(page); - - if (pos + len > inode->i_size) - ext3_truncate_failed_write(inode); - return ret ? ret : copied; -} - -/* - * bmap() is special. It gets used by applications such as lilo and by - * the swapper to find the on-disk block of a specific piece of data. - * - * Naturally, this is dangerous if the block concerned is still in the - * journal. If somebody makes a swapfile on an ext3 data-journaling - * filesystem and enables swap, then they may get a nasty shock when the - * data getting swapped to that swapfile suddenly gets overwritten by - * the original zero's written out previously to the journal and - * awaiting writeback in the kernel's buffer cache. - * - * So, if we see any bmap calls here on a modified, data-journaled file, - * take extra steps to flush any blocks which might be in the cache. - */ -static sector_t ext3_bmap(struct address_space *mapping, sector_t block) -{ - struct inode *inode = mapping->host; - journal_t *journal; - int err; - - if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { - /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT3_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get. - */ - - ext3_clear_inode_state(inode, EXT3_STATE_JDATA); - journal = EXT3_JOURNAL(inode); - journal_lock_updates(journal); - err = journal_flush(journal); - journal_unlock_updates(journal); - - if (err) - return 0; - } - - return generic_block_bmap(mapping,block,ext3_get_block); -} - -static int bget_one(handle_t *handle, struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - -static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) -{ - return !buffer_mapped(bh); -} - -/* - * Note that whenever we need to map blocks we start a transaction even if - * we're not journalling data. This is to preserve ordering: any hole - * instantiation within __block_write_full_page -> ext3_get_block() should be - * journalled along with the data so we don't crash and then get metadata which - * refers to old data. - * - * In all journalling modes block_write_full_page() will start the I/O. - * - * We don't honour synchronous mounts for writepage(). That would be - * disastrous. Any write() or metadata operation will sync the fs for - * us. - */ -static int ext3_ordered_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - /* - * We don't want to warn for emergency remount. The condition is - * ordered to avoid dereferencing inode->i_sb in non-error case to - * avoid slow-downs. - */ - WARN_ON_ONCE(IS_RDONLY(inode) && - !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); - - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. - */ - if (ext3_journal_current_handle()) - goto out_fail; - - trace_ext3_ordered_writepage(page); - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - page_bufs = page_buffers(page); - } else { - page_bufs = page_buffers(page); - if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, buffer_unmapped)) { - /* Provide NULL get_block() to catch bugs if buffers - * weren't really mapped */ - return block_write_full_page(page, NULL, wbc); - } - } - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } - - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bget_one); - - ret = block_write_full_page(page, ext3_get_block, wbc); - - /* - * The page can become unlocked at any point now, and - * truncate can then come in and change things. So we - * can't touch *page from now on. But *page_bufs is - * safe due to elevated refcount. - */ - - /* - * And attach them to the current transaction. But only if - * block_write_full_page() succeeded. Otherwise they are unmapped, - * and generally junk. - */ - if (ret == 0) - ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, journal_dirty_data_fn); - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; - -out_fail: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return ret; -} - -static int ext3_writeback_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - /* - * We don't want to warn for emergency remount. The condition is - * ordered to avoid dereferencing inode->i_sb in non-error case to - * avoid slow-downs. - */ - WARN_ON_ONCE(IS_RDONLY(inode) && - !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); - - if (ext3_journal_current_handle()) - goto out_fail; - - trace_ext3_writeback_writepage(page); - if (page_has_buffers(page)) { - if (!walk_page_buffers(NULL, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { - /* Provide NULL get_block() to catch bugs if buffers - * weren't really mapped */ - return block_write_full_page(page, NULL, wbc); - } - } - - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } - - ret = block_write_full_page(page, ext3_get_block, wbc); - - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; - -out_fail: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return ret; -} - -static int ext3_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - /* - * We don't want to warn for emergency remount. The condition is - * ordered to avoid dereferencing inode->i_sb in non-error case to - * avoid slow-downs. - */ - WARN_ON_ONCE(IS_RDONLY(inode) && - !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); - - trace_ext3_journalled_writepage(page); - if (!page_has_buffers(page) || PageChecked(page)) { - if (ext3_journal_current_handle()) - goto no_write; - - handle = ext3_journal_start(inode, - ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - ClearPageChecked(page); - ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, - ext3_get_block); - if (ret != 0) { - ext3_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - ext3_set_inode_state(inode, EXT3_STATE_JDATA); - atomic_set(&EXT3_I(inode)->i_datasync_tid, - handle->h_transaction->t_tid); - unlock_page(page); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - } else { - /* - * It is a page full of checkpoint-mode buffers. Go and write - * them. They should have been already mapped when they went - * to the journal so provide NULL get_block function to catch - * errors. - */ - ret = block_write_full_page(page, NULL, wbc); - } -out: - return ret; - -no_write: - redirty_page_for_writepage(wbc, page); -out_unlock: - unlock_page(page); - goto out; -} - -static int ext3_readpage(struct file *file, struct page *page) -{ - trace_ext3_readpage(page); - return mpage_readpage(page, ext3_get_block); -} - -static int -ext3_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); -} - -static void ext3_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - journal_t *journal = EXT3_JOURNAL(page->mapping->host); - - trace_ext3_invalidatepage(page, offset, length); - - /* - * If it's a full truncate we just forget about the pending dirtying - */ - if (offset == 0 && length == PAGE_CACHE_SIZE) - ClearPageChecked(page); - - journal_invalidatepage(journal, page, offset, length); -} - -static int ext3_releasepage(struct page *page, gfp_t wait) -{ - journal_t *journal = EXT3_JOURNAL(page->mapping->host); - - trace_ext3_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - return journal_try_to_free_buffers(journal, page, wait); -} - -/* - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext3_inode_info *ei = EXT3_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_iter_count(iter); - int retries = 0; - - trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - - if (iov_iter_rw(iter) == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext3_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext3_orphan_add(handle, inode); - if (ret) { - ext3_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext3_journal_stop(handle); - } - } - -retry: - ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block); - /* - * In case of error extending write may have instantiated a few - * blocks outside i_size. Trim these off again. - */ - if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - ext3_truncate_failed_direct_write(inode); - } - if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext3_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Truncate allocated blocks - * and pretend the write failed... */ - ext3_truncate_failed_direct_write(inode); - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext3_orphan_del(NULL, inode); - goto out; - } - if (inode->i_nlink) - ext3_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext3_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext3_mark_inode_dirty(handle, inode); - } - } - err = ext3_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); - return ret; -} - -/* - * Pages can be marked dirty completely asynchronously from ext3's journalling - * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do - * much here because ->set_page_dirty is called under VFS locks. The page is - * not necessarily locked. - * - * We cannot just dirty the page and leave attached buffers clean, because the - * buffers' dirty state is "definitive". We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. - * - * So what we do is to mark the page "pending dirty" and next time writepage - * is called, propagate that into the buffers appropriately. - */ -static int ext3_journalled_set_page_dirty(struct page *page) -{ - SetPageChecked(page); - return __set_page_dirty_nobuffers(page); -} - -static const struct address_space_operations ext3_ordered_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_ordered_writepage, - .write_begin = ext3_write_begin, - .write_end = ext3_ordered_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .is_dirty_writeback = buffer_check_dirty_writeback, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext3_writeback_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_writeback_writepage, - .write_begin = ext3_write_begin, - .write_end = ext3_writeback_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext3_journalled_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_journalled_writepage, - .write_begin = ext3_write_begin, - .write_end = ext3_journalled_write_end, - .set_page_dirty = ext3_journalled_set_page_dirty, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -void ext3_set_aops(struct inode *inode) -{ - if (ext3_should_order_data(inode)) - inode->i_mapping->a_ops = &ext3_ordered_aops; - else if (ext3_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext3_writeback_aops; - else - inode->i_mapping->a_ops = &ext3_journalled_aops; -} - -/* - * ext3_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -static int ext3_block_truncate_page(struct inode *inode, loff_t from) -{ - ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - unsigned blocksize, iblock, length, pos; - struct page *page; - handle_t *handle = NULL; - struct buffer_head *bh; - int err = 0; - - /* Truncated on block boundary - nothing to do */ - blocksize = inode->i_sb->s_blocksize; - if ((from & (blocksize - 1)) == 0) - return 0; - - page = grab_cache_page(inode->i_mapping, index); - if (!page) - return -ENOMEM; - length = blocksize - (offset & (blocksize - 1)); - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - err = 0; - if (buffer_freed(bh)) { - BUFFER_TRACE(bh, "freed: skip"); - goto unlock; - } - - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "unmapped"); - ext3_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto unlock; - } - } - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!bh_uptodate_or_lock(bh)) { - err = bh_submit_read(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (err) - goto unlock; - } - - /* data=writeback mode doesn't need transaction to zero-out data */ - if (!ext3_should_writeback_data(inode)) { - /* We journal at most one block */ - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) { - clear_highpage(page); - flush_dcache_page(page); - err = PTR_ERR(handle); - goto unlock; - } - } - - if (ext3_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext3_journal_get_write_access(handle, bh); - if (err) - goto stop; - } - - zero_user(page, offset, length); - BUFFER_TRACE(bh, "zeroed end of block"); - - err = 0; - if (ext3_should_journal_data(inode)) { - err = ext3_journal_dirty_metadata(handle, bh); - } else { - if (ext3_should_order_data(inode)) - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); - } -stop: - if (handle) - ext3_journal_stop(handle); - -unlock: - unlock_page(page); - page_cache_release(page); - return err; -} - -/* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? - */ -static inline int all_zeroes(__le32 *p, __le32 *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -/** - * ext3_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext3_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext3_truncate(). - * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is referred - * from it (and it is on the path to the first completely truncated - * data block, indeed). We have to free the top of that path along - * with everything to the right of the path. Since no allocation - * past the truncation point is possible until ext3_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext3_find_shared(struct inode *inode, int depth, - int offsets[4], Indirect chain[4], __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offset + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext3_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext3. Must leave the tree intact */ -#if 0 - *p->p = 0; -#endif - } - /* Writer: end */ - - while(partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -/* - * Zero a number of block pointers in either an inode or an indirect block. - * If we restart the transaction we must again get write access to the - * indirect block for further modification. - * - * We release `count' blocks on disk, but (last - first) may be greater - * than `count' because there can be holes in there. - */ -static void ext3_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, ext3_fsblk_t block_to_free, - unsigned long count, __le32 *first, __le32 *last) -{ - __le32 *p; - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - if (ext3_journal_dirty_metadata(handle, bh)) - return; - } - ext3_mark_inode_dirty(handle, inode); - truncate_restart_transaction(handle, inode); - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - if (ext3_journal_get_write_access(handle, bh)) - return; - } - } - - /* - * Any buffers which are on the journal will be in memory. We find - * them on the hash table so journal_revoke() will run journal_forget() - * on them. We've already detached each block from the file, so - * bforget() in journal_forget() should be safe. - * - * AKPM: turn on bforget in journal_forget()!!! - */ - for (p = first; p < last; p++) { - u32 nr = le32_to_cpu(*p); - if (nr) { - struct buffer_head *bh; - - *p = 0; - bh = sb_find_get_block(inode->i_sb, nr); - ext3_forget(handle, 0, inode, bh, nr); - } - } - - ext3_free_blocks(handle, inode, block_to_free, count); -} - -/** - * ext3_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks referred from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext3_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext3_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err; - - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. */ - if (err) - return; - } - - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - ext3_clear_blocks(handle, inode, this_bh, - block_to_free, - count, block_to_free_p, p); - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } - } - - if (count > 0) - ext3_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); - - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); - - /* - * The buffer head should have an attached journal head at this - * point. However, if the data is corrupted and an indirect - * block pointed to itself, it would have been detached when - * the block was cleared. Check for this instead of OOPSing. - */ - if (bh2jh(this_bh)) - ext3_journal_dirty_metadata(handle, this_bh); - else - ext3_error(inode->i_sb, "ext3_free_data", - "circular indirect block detected, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long)this_bh->b_blocknr); - } -} - -/** - * ext3_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks referred from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. - */ -static void ext3_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext3_fsblk_t nr; - __le32 *p; - - if (is_handle_aborted(handle)) - return; - - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ - - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); - - /* - * A read failure? Report error and clear slot - * (should be rare). - */ - if (!bh) { - ext3_error(inode->i_sb, "ext3_free_branches", - "Read failure, inode=%lu, block="E3FSBLK, - inode->i_ino, nr); - continue; - } - - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext3_free_branches(handle, inode, bh, - (__le32*)bh->b_data, - (__le32*)bh->b_data + addr_per_block, - depth); - - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. - */ - if (is_handle_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext3_mark_inode_dirty(handle, inode); - truncate_restart_transaction(handle, inode); - } - - /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext3_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. Thus - * we don't allow a block to be reallocated until - * a transaction freeing it has fully committed. - * - * We also have to make sure journal replay after a - * crash does not overwrite non-journaled data blocks - * with old metadata when the block got reallocated for - * data. Thus we have to store a revoke record for a - * block in the same transaction in which we free the - * block. - */ - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - - ext3_free_blocks(handle, inode, nr, 1); - - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext3_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, - parent_bh); - } - } - } - } else { - /* We have reached the bottom of the tree. */ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext3_free_data(handle, inode, parent_bh, first, last); - } -} - -int ext3_can_truncate(struct inode *inode) -{ - if (S_ISREG(inode->i_mode)) - return 1; - if (S_ISDIR(inode->i_mode)) - return 1; - if (S_ISLNK(inode->i_mode)) - return !ext3_inode_is_fast_symlink(inode); - return 0; -} - -/* - * ext3_truncate() - * - * We block out ext3_get_block() block instantiations across the entire - * transaction, and VFS/VM ensures that ext3_truncate() cannot run - * simultaneously on behalf of the same inode. - * - * As we work through the truncate and commit bits of it to the journal there - * is one core, guiding principle: the file's tree must always be consistent on - * disk. We must be able to restart the truncate after a crash. - * - * The file's tree may be transiently inconsistent in memory (although it - * probably isn't), but whenever we close off and commit a journal transaction, - * the contents of (the filesystem + the journal) must be consistent and - * restartable. It's pretty simple, really: bottom up, right to left (although - * left-to-right works OK too). - * - * Note that at recovery time, journal replay occurs *before* the restart of - * truncate against the orphan inode list. - * - * The committed inode has the new, desired i_size (which is the same as - * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see - * that this inode's truncate did not complete and it will again call - * ext3_truncate() to have another go. So there will be instantiated blocks - * to the right of the truncation point in a crashed ext3 filesystem. But - * that's fine - as long as they are linked from the inode, the post-crash - * ext3_truncate() run will find them and release them. - */ -void ext3_truncate(struct inode *inode) -{ - handle_t *handle; - struct ext3_inode_info *ei = EXT3_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - int offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n; - long last_block; - unsigned blocksize = inode->i_sb->s_blocksize; - - trace_ext3_truncate_enter(inode); - - if (!ext3_can_truncate(inode)) - goto out_notrans; - - if (inode->i_size == 0 && ext3_should_writeback_data(inode)) - ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - - handle = start_transaction(inode); - if (IS_ERR(handle)) - goto out_notrans; - - last_block = (inode->i_size + blocksize-1) - >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - n = ext3_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ - - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext3_orphan_add(handle, inode)) - goto out_stop; - - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext3 *really* writes onto the disk inode. - */ - ei->i_disksize = inode->i_size; - - /* - * From here we block out all ext3_get_block() callers who want to - * modify the block allocation tree. - */ - mutex_lock(&ei->truncate_mutex); - - if (n == 1) { /* direct blocks */ - ext3_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT3_NDIR_BLOCKS); - goto do_indirects; - } - - partial = ext3_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext3_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - ext3_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext3_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse (partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT3_IND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT3_IND_BLOCK] = 0; - } - case EXT3_IND_BLOCK: - nr = i_data[EXT3_DIND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT3_DIND_BLOCK] = 0; - } - case EXT3_DIND_BLOCK: - nr = i_data[EXT3_TIND_BLOCK]; - if (nr) { - ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT3_TIND_BLOCK] = 0; - } - case EXT3_TIND_BLOCK: - ; - } - - ext3_discard_reservation(inode); - - mutex_unlock(&ei->truncate_mutex); - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - handle->h_sync = 1; -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext3_evict_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext3_orphan_del(handle, inode); - - ext3_journal_stop(handle); - trace_ext3_truncate_exit(inode); - return; -out_notrans: - /* - * Delete the inode from orphan list so that it doesn't stay there - * forever and trigger assertion on umount. - */ - if (inode->i_nlink) - ext3_orphan_del(NULL, inode); - trace_ext3_truncate_exit(inode); -} - -static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, - unsigned long ino, struct ext3_iloc *iloc) -{ - unsigned long block_group; - unsigned long offset; - ext3_fsblk_t block; - struct ext3_group_desc *gdp; - - if (!ext3_valid_inum(sb, ino)) { - /* - * This error is already checked for in namei.c unless we are - * looking at an NFS filehandle, in which case no error - * report is needed - */ - return 0; - } - - block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - gdp = ext3_get_group_desc(sb, block_group, NULL); - if (!gdp) - return 0; - /* - * Figure out the offset within the block group inode table - */ - offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * - EXT3_INODE_SIZE(sb); - block = le32_to_cpu(gdp->bg_inode_table) + - (offset >> EXT3_BLOCK_SIZE_BITS(sb)); - - iloc->block_group = block_group; - iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); - return block; -} - -/* - * ext3_get_inode_loc returns with an extra refcount against the inode's - * underlying buffer_head on success. If 'in_mem' is true, we have all - * data in memory that is needed to recreate the on-disk version of this - * inode. - */ -static int __ext3_get_inode_loc(struct inode *inode, - struct ext3_iloc *iloc, int in_mem) -{ - ext3_fsblk_t block; - struct buffer_head *bh; - - block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); - if (!block) - return -EIO; - - bh = sb_getblk(inode->i_sb, block); - if (unlikely(!bh)) { - ext3_error (inode->i_sb, "ext3_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block="E3FSBLK, - inode->i_ino, block); - return -ENOMEM; - } - if (!buffer_uptodate(bh)) { - lock_buffer(bh); - - /* - * If the buffer has the write error flag, we have failed - * to write out another inode in the same block. In this - * case, we don't have to read the block because we may - * read the old inode data successfully. - */ - if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) - set_buffer_uptodate(bh); - - if (buffer_uptodate(bh)) { - /* someone brought it uptodate while we waited */ - unlock_buffer(bh); - goto has_buffer; - } - - /* - * If we have all information of the inode in memory and this - * is the only valid inode in the block, we need not read the - * block. - */ - if (in_mem) { - struct buffer_head *bitmap_bh; - struct ext3_group_desc *desc; - int inodes_per_buffer; - int inode_offset, i; - int block_group; - int start; - - block_group = (inode->i_ino - 1) / - EXT3_INODES_PER_GROUP(inode->i_sb); - inodes_per_buffer = bh->b_size / - EXT3_INODE_SIZE(inode->i_sb); - inode_offset = ((inode->i_ino - 1) % - EXT3_INODES_PER_GROUP(inode->i_sb)); - start = inode_offset & ~(inodes_per_buffer - 1); - - /* Is the inode bitmap in cache? */ - desc = ext3_get_group_desc(inode->i_sb, - block_group, NULL); - if (!desc) - goto make_io; - - bitmap_bh = sb_getblk(inode->i_sb, - le32_to_cpu(desc->bg_inode_bitmap)); - if (unlikely(!bitmap_bh)) - goto make_io; - - /* - * If the inode bitmap isn't in cache then the - * optimisation may end up performing two reads instead - * of one, so skip it. - */ - if (!buffer_uptodate(bitmap_bh)) { - brelse(bitmap_bh); - goto make_io; - } - for (i = start; i < start + inodes_per_buffer; i++) { - if (i == inode_offset) - continue; - if (ext3_test_bit(i, bitmap_bh->b_data)) - break; - } - brelse(bitmap_bh); - if (i == start + inodes_per_buffer) { - /* all other inodes are free, so skip I/O */ - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - unlock_buffer(bh); - goto has_buffer; - } - } - -make_io: - /* - * There are other valid inodes in the buffer, this inode - * has in-inode xattrs, or we don't have this inode in memory. - * Read the block from disk. - */ - trace_ext3_load_inode(inode); - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - ext3_error(inode->i_sb, "ext3_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block="E3FSBLK, - inode->i_ino, block); - brelse(bh); - return -EIO; - } - } -has_buffer: - iloc->bh = bh; - return 0; -} - -int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) -{ - /* We have all inode data except xattrs in memory here. */ - return __ext3_get_inode_loc(inode, iloc, - !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); -} - -void ext3_set_inode_flags(struct inode *inode) -{ - unsigned int flags = EXT3_I(inode)->i_flags; - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (flags & EXT3_SYNC_FL) - inode->i_flags |= S_SYNC; - if (flags & EXT3_APPEND_FL) - inode->i_flags |= S_APPEND; - if (flags & EXT3_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - if (flags & EXT3_NOATIME_FL) - inode->i_flags |= S_NOATIME; - if (flags & EXT3_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; -} - -/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ -void ext3_get_inode_flags(struct ext3_inode_info *ei) -{ - unsigned int flags = ei->vfs_inode.i_flags; - - ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| - EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); - if (flags & S_SYNC) - ei->i_flags |= EXT3_SYNC_FL; - if (flags & S_APPEND) - ei->i_flags |= EXT3_APPEND_FL; - if (flags & S_IMMUTABLE) - ei->i_flags |= EXT3_IMMUTABLE_FL; - if (flags & S_NOATIME) - ei->i_flags |= EXT3_NOATIME_FL; - if (flags & S_DIRSYNC) - ei->i_flags |= EXT3_DIRSYNC_FL; -} - -struct inode *ext3_iget(struct super_block *sb, unsigned long ino) -{ - struct ext3_iloc iloc; - struct ext3_inode *raw_inode; - struct ext3_inode_info *ei; - struct buffer_head *bh; - struct inode *inode; - journal_t *journal = EXT3_SB(sb)->s_journal; - transaction_t *transaction; - long ret; - int block; - uid_t i_uid; - gid_t i_gid; - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - - ei = EXT3_I(inode); - ei->i_block_alloc_info = NULL; - - ret = __ext3_get_inode_loc(inode, &iloc, 0); - if (ret < 0) - goto bad_inode; - bh = iloc.bh; - raw_inode = ext3_raw_inode(&iloc); - inode->i_mode = le16_to_cpu(raw_inode->i_mode); - i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if(!(test_opt (inode->i_sb, NO_UID32))) { - i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; - } - i_uid_write(inode, i_uid); - i_gid_write(inode, i_gid); - set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); - inode->i_size = le32_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; - - ei->i_state_flags = 0; - ei->i_dir_start_lookup = 0; - ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); - /* We now have enough fields to check if the inode was active or not. - * This is needed because nfsd might try to access dead inodes - * the test is that same one that e2fsck uses - * NeilBrown 1999oct15 - */ - if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { - /* this inode is deleted */ - brelse (bh); - ret = -ESTALE; - goto bad_inode; - } - /* The only unlinked inodes we let through here have - * valid i_mode and are being read by the orphan - * recovery code: that's fine, we're about to complete - * the process of deleting those. */ - } - inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); - ei->i_flags = le32_to_cpu(raw_inode->i_flags); -#ifdef EXT3_FRAGMENTS - ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); - ei->i_frag_no = raw_inode->i_frag; - ei->i_frag_size = raw_inode->i_fsize; -#endif - ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); - if (!S_ISREG(inode->i_mode)) { - ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); - } else { - inode->i_size |= - ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; - } - ei->i_disksize = inode->i_size; - inode->i_generation = le32_to_cpu(raw_inode->i_generation); - ei->i_block_group = iloc.block_group; - /* - * NOTE! The in-memory inode i_data array is in little-endian order - * even on big-endian machines: we do NOT byteswap the block numbers! - */ - for (block = 0; block < EXT3_N_BLOCKS; block++) - ei->i_data[block] = raw_inode->i_block[block]; - INIT_LIST_HEAD(&ei->i_orphan); - - /* - * Set transaction id's of transactions that have to be committed - * to finish f[data]sync. We set them to currently running transaction - * as we cannot be sure that the inode or some of its metadata isn't - * part of the transaction - the inode could have been reclaimed and - * now it is reread from disk. - */ - if (journal) { - tid_t tid; - - spin_lock(&journal->j_state_lock); - if (journal->j_running_transaction) - transaction = journal->j_running_transaction; - else - transaction = journal->j_committing_transaction; - if (transaction) - tid = transaction->t_tid; - else - tid = journal->j_commit_sequence; - spin_unlock(&journal->j_state_lock); - atomic_set(&ei->i_sync_tid, tid); - atomic_set(&ei->i_datasync_tid, tid); - } - - if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && - EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { - /* - * When mke2fs creates big inodes it does not zero out - * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, - * so ignore those first few inodes. - */ - ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); - if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT3_INODE_SIZE(inode->i_sb)) { - brelse (bh); - ret = -EIO; - goto bad_inode; - } - if (ei->i_extra_isize == 0) { - /* The extra space is currently unused. Use it. */ - ei->i_extra_isize = sizeof(struct ext3_inode) - - EXT3_GOOD_OLD_INODE_SIZE; - } else { - __le32 *magic = (void *)raw_inode + - EXT3_GOOD_OLD_INODE_SIZE + - ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) - ext3_set_inode_state(inode, EXT3_STATE_XATTR); - } - } else - ei->i_extra_isize = 0; - - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - ext3_set_aops(inode); - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { - if (ext3_inode_is_fast_symlink(inode)) { - inode->i_op = &ext3_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); - inode->i_link = (char *)ei->i_data; - } else { - inode->i_op = &ext3_symlink_inode_operations; - ext3_set_aops(inode); - } - } else { - inode->i_op = &ext3_special_inode_operations; - if (raw_inode->i_block[0]) - init_special_inode(inode, inode->i_mode, - old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); - else - init_special_inode(inode, inode->i_mode, - new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); - } - brelse (iloc.bh); - ext3_set_inode_flags(inode); - unlock_new_inode(inode); - return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(ret); -} - -/* - * Post the struct inode info into an on-disk inode location in the - * buffer-cache. This gobbles the caller's reference to the - * buffer_head in the inode location struct. - * - * The caller must have write access to iloc->bh. - */ -static int ext3_do_update_inode(handle_t *handle, - struct inode *inode, - struct ext3_iloc *iloc) -{ - struct ext3_inode *raw_inode = ext3_raw_inode(iloc); - struct ext3_inode_info *ei = EXT3_I(inode); - struct buffer_head *bh = iloc->bh; - int err = 0, rc, block; - int need_datasync = 0; - __le32 disksize; - uid_t i_uid; - gid_t i_gid; - -again: - /* we can't allow multiple procs in here at once, its a bit racey */ - lock_buffer(bh); - - /* For fields not not tracking in the in-memory inode, - * initialise them to zero for new inodes. */ - if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) - memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - - ext3_get_inode_flags(ei); - raw_inode->i_mode = cpu_to_le16(inode->i_mode); - i_uid = i_uid_read(inode); - i_gid = i_gid_read(inode); - if(!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); -/* - * Fix up interoperability with old kernels. Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ - if(!ei->i_dtime) { - raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(i_uid)); - raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - } else { - raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(i_uid)); - raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(i_gid)); - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - disksize = cpu_to_le32(ei->i_disksize); - if (disksize != raw_inode->i_size) { - need_datasync = 1; - raw_inode->i_size = disksize; - } - raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); -#ifdef EXT3_FRAGMENTS - raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); - raw_inode->i_frag = ei->i_frag_no; - raw_inode->i_fsize = ei->i_frag_size; -#endif - raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); - if (!S_ISREG(inode->i_mode)) { - raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); - } else { - disksize = cpu_to_le32(ei->i_disksize >> 32); - if (disksize != raw_inode->i_size_high) { - raw_inode->i_size_high = disksize; - need_datasync = 1; - } - if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; - if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || - EXT3_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT3_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock. - */ - unlock_buffer(bh); - err = ext3_journal_get_write_access(handle, - EXT3_SB(sb)->s_sbh); - if (err) - goto out_brelse; - - ext3_update_dynamic_rev(sb); - EXT3_SET_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_LARGE_FILE); - handle->h_sync = 1; - err = ext3_journal_dirty_metadata(handle, - EXT3_SB(sb)->s_sbh); - /* get our lock and start over */ - goto again; - } - } - } - raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - raw_inode->i_block[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - raw_inode->i_block[1] = 0; - } else { - raw_inode->i_block[0] = 0; - raw_inode->i_block[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - raw_inode->i_block[2] = 0; - } - } else for (block = 0; block < EXT3_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - - if (ei->i_extra_isize) - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); - - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - unlock_buffer(bh); - rc = ext3_journal_dirty_metadata(handle, bh); - if (!err) - err = rc; - ext3_clear_inode_state(inode, EXT3_STATE_NEW); - - atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); - if (need_datasync) - atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); -out_brelse: - brelse (bh); - ext3_std_error(inode->i_sb, err); - return err; -} - -/* - * ext3_write_inode() - * - * We are called from a few places: - * - * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. - * Here, there will be no transaction running. We wait for any running - * transaction to commit. - * - * - Within flush work (for sys_sync(), kupdate and such). - * We wait on commit, if told to. - * - * - Within iput_final() -> write_inode_now() - * We wait on commit, if told to. - * - * In all cases it is actually safe for us to return without doing anything, - * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL - * writeback. - * - * Note that we are absolutely dependent upon all inode dirtiers doing the - * right thing: they *must* call mark_inode_dirty() after dirtying info in - * which we are interested. - * - * It would be a bug for them to not do this. The code: - * - * mark_inode_dirty(inode) - * stuff(); - * inode->i_size = expr; - * - * is in error because write_inode() could occur while `stuff()' is running, - * and the new i_size will be lost. Plus the inode will no longer be on the - * superblock's dirty inode list. - */ -int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) - return 0; - - if (ext3_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); - dump_stack(); - return -EIO; - } - - /* - * No need to force transaction in WB_SYNC_NONE mode. Also - * ext3_sync_fs() will force the commit after everything is - * written. - */ - if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) - return 0; - - return ext3_force_commit(inode->i_sb); -} - -/* - * ext3_setattr() - * - * Called from notify_change. - * - * We want to trap VFS attempts to truncate the file as soon as - * possible. In particular, we want to make sure that when the VFS - * shrinks i_size, we put the inode on the orphan list and modify - * i_disksize immediately, so that during the subsequent flushing of - * dirty pages and freeing of disk blocks, we can guarantee that any - * commit will leave the blocks being flushed in an unused state on - * disk. (On recovery, the inode will get truncated and the blocks will - * be freed, so we have a strong guarantee that no future commit will - * leave these blocks visible to the user.) - * - * Called with inode->sem down. - */ -int ext3_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - int error, rc = 0; - const unsigned int ia_valid = attr->ia_valid; - - error = inode_change_ok(inode, attr); - if (error) - return error; - - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); - if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || - (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { - handle_t *handle; - - /* (user+group)*(old+new) structure, inode write (sb, - * inode block, ? - but truncate inode update has it) */ - handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - error = dquot_transfer(inode, attr); - if (error) { - ext3_journal_stop(handle); - return error; - } - /* Update corresponding info in inode so that everything is in - * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; - error = ext3_mark_inode_dirty(handle, inode); - ext3_journal_stop(handle); - } - - if (attr->ia_valid & ATTR_SIZE) - inode_dio_wait(inode); - - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { - handle_t *handle; - - handle = ext3_journal_start(inode, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - - error = ext3_orphan_add(handle, inode); - if (error) { - ext3_journal_stop(handle); - goto err_out; - } - EXT3_I(inode)->i_disksize = attr->ia_size; - error = ext3_mark_inode_dirty(handle, inode); - ext3_journal_stop(handle); - if (error) { - /* Some hard fs error must have happened. Bail out. */ - ext3_orphan_del(NULL, inode); - goto err_out; - } - rc = ext3_block_truncate_page(inode, attr->ia_size); - if (rc) { - /* Cleanup orphan list and exit */ - handle = ext3_journal_start(inode, 3); - if (IS_ERR(handle)) { - ext3_orphan_del(NULL, inode); - goto err_out; - } - ext3_orphan_del(handle, inode); - ext3_journal_stop(handle); - goto err_out; - } - } - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - ext3_truncate(inode); - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - - if (ia_valid & ATTR_MODE) - rc = posix_acl_chmod(inode, inode->i_mode); - -err_out: - ext3_std_error(inode->i_sb, error); - if (!error) - error = rc; - return error; -} - - -/* - * How many blocks doth make a writepage()? - * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. - * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files - * - * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS - * - * With ordered or writeback data it's the same, less the N data blocks. - * - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". - * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. - */ - -static int ext3_writepage_trans_blocks(struct inode *inode) -{ - int bpp = ext3_journal_blocks_per_page(inode); - int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; - int ret; - - if (ext3_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + indirects + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during dquot_initialize so - * we will be updating only the data blocks + inodes */ - ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); -#endif - - return ret; -} - -/* - * The caller must have previously called ext3_reserve_inode_write(). - * Give this, we know that the caller already has write access to iloc->bh. - */ -int ext3_mark_iloc_dirty(handle_t *handle, - struct inode *inode, struct ext3_iloc *iloc) -{ - int err = 0; - - /* the do_update_inode consumes one bh->b_count */ - get_bh(iloc->bh); - - /* ext3_do_update_inode() does journal_dirty_metadata */ - err = ext3_do_update_inode(handle, inode, iloc); - put_bh(iloc->bh); - return err; -} - -/* - * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. - */ - -int -ext3_reserve_inode_write(handle_t *handle, struct inode *inode, - struct ext3_iloc *iloc) -{ - int err = 0; - if (handle) { - err = ext3_get_inode_loc(inode, iloc); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, iloc->bh); - if (err) { - brelse(iloc->bh); - iloc->bh = NULL; - } - } - } - ext3_std_error(inode->i_sb, err); - return err; -} - -/* - * What we do here is to mark the in-core inode as clean with respect to inode - * dirtiness (it may still be data-dirty). - * This means that the in-core inode may be reaped by prune_icache - * without having to perform any I/O. This is a very good thing, - * because *any* task may call prune_icache - even ones which - * have a transaction open against a different journal. - * - * Is this cheating? Not really. Sure, we haven't written the - * inode out, but prune_icache isn't a user-visible syncing function. - * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) - * we start and wait on commits. - */ -int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) -{ - struct ext3_iloc iloc; - int err; - - might_sleep(); - trace_ext3_mark_inode_dirty(inode, _RET_IP_); - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (!err) - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - return err; -} - -/* - * ext3_dirty_inode() is called from __mark_inode_dirty() - * - * We're really interested in the case where a file is being extended. - * i_size has been changed by generic_commit_write() and we thus need - * to include the updated inode in the current transaction. - * - * Also, dquot_alloc_space() will always dirty the inode when blocks - * are allocated to the file. - * - * If the inode is marked synchronous, we don't honour that here - doing - * so would cause a commit on atime updates, which we don't bother doing. - * We handle synchronous inodes at the highest possible level. - */ -void ext3_dirty_inode(struct inode *inode, int flags) -{ - handle_t *current_handle = ext3_journal_current_handle(); - handle_t *handle; - - handle = ext3_journal_start(inode, 2); - if (IS_ERR(handle)) - goto out; - if (current_handle && - current_handle->h_transaction != handle->h_transaction) { - /* This task has a transaction open against a different fs */ - printk(KERN_EMERG "%s: transactions do not match!\n", - __func__); - } else { - jbd_debug(5, "marking dirty. outer handle=%p\n", - current_handle); - ext3_mark_inode_dirty(handle, inode); - } - ext3_journal_stop(handle); -out: - return; -} - -#if 0 -/* - * Bind an inode's backing buffer_head into this transaction, to prevent - * it from being flushed to disk early. Unlike - * ext3_reserve_inode_write, this leaves behind no bh reference and - * returns no iloc structure, so the caller needs to repeat the iloc - * lookup to mark the inode dirty later. - */ -static int ext3_pin_inode(handle_t *handle, struct inode *inode) -{ - struct ext3_iloc iloc; - - int err = 0; - if (handle) { - err = ext3_get_inode_loc(inode, &iloc); - if (!err) { - BUFFER_TRACE(iloc.bh, "get_write_access"); - err = journal_get_write_access(handle, iloc.bh); - if (!err) - err = ext3_journal_dirty_metadata(handle, - iloc.bh); - brelse(iloc.bh); - } - } - ext3_std_error(inode->i_sb, err); - return err; -} -#endif - -int ext3_change_inode_journal_flag(struct inode *inode, int val) -{ - journal_t *journal; - handle_t *handle; - int err; - - /* - * We have to be very careful here: changing a data block's - * journaling status dynamically is dangerous. If we write a - * data block to the journal, change the status and then delete - * that block, we risk forgetting to revoke the old log record - * from the journal and so a subsequent replay can corrupt data. - * So, first we make sure that the journal is empty and that - * nobody is changing anything. - */ - - journal = EXT3_JOURNAL(inode); - if (is_journal_aborted(journal)) - return -EROFS; - - journal_lock_updates(journal); - journal_flush(journal); - - /* - * OK, there are no updates running now, and all cached data is - * synced to disk. We are now in a completely consistent state - * which doesn't have anything in the journal, and we know that - * no filesystem updates are running, so it is safe to modify - * the inode's in-core data-journaling state flag now. - */ - - if (val) - EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; - else - EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; - ext3_set_aops(inode); - - journal_unlock_updates(journal); - - /* Finally we can mark the inode as dirty. */ - - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - err = ext3_mark_inode_dirty(handle, inode); - handle->h_sync = 1; - ext3_journal_stop(handle); - ext3_std_error(inode->i_sb, err); - - return err; -} diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c deleted file mode 100644 index 4d96e9a64532..000000000000 --- a/fs/ext3/ioctl.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * linux/fs/ext3/ioctl.c - * - * Copyright (C) 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - */ - -#include <linux/mount.h> -#include <linux/compat.h> -#include <asm/uaccess.h> -#include "ext3.h" - -long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct ext3_inode_info *ei = EXT3_I(inode); - unsigned int flags; - unsigned short rsv_window_size; - - ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); - - switch (cmd) { - case EXT3_IOC_GETFLAGS: - ext3_get_inode_flags(ei); - flags = ei->i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); - case EXT3_IOC_SETFLAGS: { - handle_t *handle = NULL; - int err; - struct ext3_iloc iloc; - unsigned int oldflags; - unsigned int jflag; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (get_user(flags, (int __user *) arg)) - return -EFAULT; - - err = mnt_want_write_file(filp); - if (err) - return err; - - flags = ext3_mask_flags(inode->i_mode, flags); - - mutex_lock(&inode->i_mutex); - - /* Is it quota file? Do not allow user to mess with it */ - err = -EPERM; - if (IS_NOQUOTA(inode)) - goto flags_out; - - oldflags = ei->i_flags; - - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT3_JOURNAL_DATA_FL; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto flags_out; - } - - /* - * The JOURNAL_DATA flag can only be changed by - * the relevant capability. - */ - if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) - goto flags_out; - } - - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto flags_out; - } - if (IS_SYNC(inode)) - handle->h_sync = 1; - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - flags = flags & EXT3_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; - ei->i_flags = flags; - - ext3_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME_SEC; - - err = ext3_mark_iloc_dirty(handle, inode, &iloc); -flags_err: - ext3_journal_stop(handle); - if (err) - goto flags_out; - - if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) - err = ext3_change_inode_journal_flag(inode, jflag); -flags_out: - mutex_unlock(&inode->i_mutex); - mnt_drop_write_file(filp); - return err; - } - case EXT3_IOC_GETVERSION: - case EXT3_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int __user *) arg); - case EXT3_IOC_SETVERSION: - case EXT3_IOC_SETVERSION_OLD: { - handle_t *handle; - struct ext3_iloc iloc; - __u32 generation; - int err; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - err = mnt_want_write_file(filp); - if (err) - return err; - if (get_user(generation, (int __user *) arg)) { - err = -EFAULT; - goto setversion_out; - } - - mutex_lock(&inode->i_mutex); - handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto unlock_out; - } - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err == 0) { - inode->i_ctime = CURRENT_TIME_SEC; - inode->i_generation = generation; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - } - ext3_journal_stop(handle); - -unlock_out: - mutex_unlock(&inode->i_mutex); -setversion_out: - mnt_drop_write_file(filp); - return err; - } - case EXT3_IOC_GETRSVSZ: - if (test_opt(inode->i_sb, RESERVATION) - && S_ISREG(inode->i_mode) - && ei->i_block_alloc_info) { - rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; - return put_user(rsv_window_size, (int __user *)arg); - } - return -ENOTTY; - case EXT3_IOC_SETRSVSZ: { - int err; - - if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) - return -ENOTTY; - - err = mnt_want_write_file(filp); - if (err) - return err; - - if (!inode_owner_or_capable(inode)) { - err = -EACCES; - goto setrsvsz_out; - } - - if (get_user(rsv_window_size, (int __user *)arg)) { - err = -EFAULT; - goto setrsvsz_out; - } - - if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) - rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; - - /* - * need to allocate reservation structure for this inode - * before set the window size - */ - mutex_lock(&ei->truncate_mutex); - if (!ei->i_block_alloc_info) - ext3_init_block_alloc_info(inode); - - if (ei->i_block_alloc_info){ - struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; - rsv->rsv_goal_size = rsv_window_size; - } - mutex_unlock(&ei->truncate_mutex); -setrsvsz_out: - mnt_drop_write_file(filp); - return err; - } - case EXT3_IOC_GROUP_EXTEND: { - ext3_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; - int err, err2; - - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - - err = mnt_want_write_file(filp); - if (err) - return err; - - if (get_user(n_blocks_count, (__u32 __user *)arg)) { - err = -EFAULT; - goto group_extend_out; - } - err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); - journal_lock_updates(EXT3_SB(sb)->s_journal); - err2 = journal_flush(EXT3_SB(sb)->s_journal); - journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err == 0) - err = err2; -group_extend_out: - mnt_drop_write_file(filp); - return err; - } - case EXT3_IOC_GROUP_ADD: { - struct ext3_new_group_data input; - struct super_block *sb = inode->i_sb; - int err, err2; - - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - - err = mnt_want_write_file(filp); - if (err) - return err; - - if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, - sizeof(input))) { - err = -EFAULT; - goto group_add_out; - } - - err = ext3_group_add(sb, &input); - journal_lock_updates(EXT3_SB(sb)->s_journal); - err2 = journal_flush(EXT3_SB(sb)->s_journal); - journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err == 0) - err = err2; -group_add_out: - mnt_drop_write_file(filp); - return err; - } - case FITRIM: { - - struct super_block *sb = inode->i_sb; - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&range, (struct fstrim_range __user *)arg, - sizeof(range))) - return -EFAULT; - - ret = ext3_trim_fs(sb, &range); - if (ret < 0) - return ret; - - if (copy_to_user((struct fstrim_range __user *)arg, &range, - sizeof(range))) - return -EFAULT; - - return 0; - } - - default: - return -ENOTTY; - } -} - -#ifdef CONFIG_COMPAT -long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - /* These are just misnamed, they actually get/put from/to user an int */ - switch (cmd) { - case EXT3_IOC32_GETFLAGS: - cmd = EXT3_IOC_GETFLAGS; - break; - case EXT3_IOC32_SETFLAGS: - cmd = EXT3_IOC_SETFLAGS; - break; - case EXT3_IOC32_GETVERSION: - cmd = EXT3_IOC_GETVERSION; - break; - case EXT3_IOC32_SETVERSION: - cmd = EXT3_IOC_SETVERSION; - break; - case EXT3_IOC32_GROUP_EXTEND: - cmd = EXT3_IOC_GROUP_EXTEND; - break; - case EXT3_IOC32_GETVERSION_OLD: - cmd = EXT3_IOC_GETVERSION_OLD; - break; - case EXT3_IOC32_SETVERSION_OLD: - cmd = EXT3_IOC_SETVERSION_OLD; - break; -#ifdef CONFIG_JBD_DEBUG - case EXT3_IOC32_WAIT_FOR_READONLY: - cmd = EXT3_IOC_WAIT_FOR_READONLY; - break; -#endif - case EXT3_IOC32_GETRSVSZ: - cmd = EXT3_IOC_GETRSVSZ; - break; - case EXT3_IOC32_SETRSVSZ: - cmd = EXT3_IOC_SETRSVSZ; - break; - case EXT3_IOC_GROUP_ADD: - break; - default: - return -ENOIOCTLCMD; - } - return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -} -#endif diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c deleted file mode 100644 index c9e767cd4b67..000000000000 --- a/fs/ext3/namei.c +++ /dev/null @@ -1,2586 +0,0 @@ -/* - * linux/fs/ext3/namei.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/namei.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * Directory entry file type support and forward compatibility hooks - * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 - * Hash Tree Directory indexing (c) - * Daniel Phillips, 2001 - * Hash Tree Directory indexing porting - * Christopher Li, 2002 - * Hash Tree Directory indexing cleanup - * Theodore Ts'o, 2002 - */ - -#include <linux/quotaops.h> -#include "ext3.h" -#include "namei.h" -#include "xattr.h" -#include "acl.h" - -/* - * define how far ahead to read directories while searching them. - */ -#define NAMEI_RA_CHUNKS 2 -#define NAMEI_RA_BLOCKS 4 -#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) - -static struct buffer_head *ext3_append(handle_t *handle, - struct inode *inode, - u32 *block, int *err) -{ - struct buffer_head *bh; - - *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - - if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) { - inode->i_size += inode->i_sb->s_blocksize; - EXT3_I(inode)->i_disksize = inode->i_size; - *err = ext3_journal_get_write_access(handle, bh); - if (*err) { - brelse(bh); - bh = NULL; - } - } - return bh; -} - -#ifndef assert -#define assert(test) J_ASSERT(test) -#endif - -#ifdef DX_DEBUG -#define dxtrace(command) command -#else -#define dxtrace(command) -#endif - -struct fake_dirent -{ - __le32 inode; - __le16 rec_len; - u8 name_len; - u8 file_type; -}; - -struct dx_countlimit -{ - __le16 limit; - __le16 count; -}; - -struct dx_entry -{ - __le32 hash; - __le32 block; -}; - -/* - * dx_root_info is laid out so that if it should somehow get overlaid by a - * dirent the two low bits of the hash version will be zero. Therefore, the - * hash version mod 4 should never be 0. Sincerely, the paranoia department. - */ - -struct dx_root -{ - struct fake_dirent dot; - char dot_name[4]; - struct fake_dirent dotdot; - char dotdot_name[4]; - struct dx_root_info - { - __le32 reserved_zero; - u8 hash_version; - u8 info_length; /* 8 */ - u8 indirect_levels; - u8 unused_flags; - } - info; - struct dx_entry entries[0]; -}; - -struct dx_node -{ - struct fake_dirent fake; - struct dx_entry entries[0]; -}; - - -struct dx_frame -{ - struct buffer_head *bh; - struct dx_entry *entries; - struct dx_entry *at; -}; - -struct dx_map_entry -{ - u32 hash; - u16 offs; - u16 size; -}; - -static inline unsigned dx_get_block (struct dx_entry *entry); -static void dx_set_block (struct dx_entry *entry, unsigned value); -static inline unsigned dx_get_hash (struct dx_entry *entry); -static void dx_set_hash (struct dx_entry *entry, unsigned value); -static unsigned dx_get_count (struct dx_entry *entries); -static unsigned dx_get_limit (struct dx_entry *entries); -static void dx_set_count (struct dx_entry *entries, unsigned value); -static void dx_set_limit (struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -static unsigned dx_node_limit (struct inode *dir); -static struct dx_frame *dx_probe(struct qstr *entry, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame, - int *err); -static void dx_release (struct dx_frame *frames); -static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); -static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, - struct dx_map_entry *offsets, int count); -static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize); -static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); -static int ext3_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash); -static struct buffer_head * ext3_dx_find_entry(struct inode *dir, - struct qstr *entry, struct ext3_dir_entry_2 **res_dir, - int *err); -static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); - -/* - * p is at least 6 bytes before the end of page - */ -static inline struct ext3_dir_entry_2 * -ext3_next_entry(struct ext3_dir_entry_2 *p) -{ - return (struct ext3_dir_entry_2 *)((char *)p + - ext3_rec_len_from_disk(p->rec_len)); -} - -/* - * Future: use high four bits of block for coalesce-on-delete flags - * Mask them off for now. - */ - -static inline unsigned dx_get_block (struct dx_entry *entry) -{ - return le32_to_cpu(entry->block) & 0x00ffffff; -} - -static inline void dx_set_block (struct dx_entry *entry, unsigned value) -{ - entry->block = cpu_to_le32(value); -} - -static inline unsigned dx_get_hash (struct dx_entry *entry) -{ - return le32_to_cpu(entry->hash); -} - -static inline void dx_set_hash (struct dx_entry *entry, unsigned value) -{ - entry->hash = cpu_to_le32(value); -} - -static inline unsigned dx_get_count (struct dx_entry *entries) -{ - return le16_to_cpu(((struct dx_countlimit *) entries)->count); -} - -static inline unsigned dx_get_limit (struct dx_entry *entries) -{ - return le16_to_cpu(((struct dx_countlimit *) entries)->limit); -} - -static inline void dx_set_count (struct dx_entry *entries, unsigned value) -{ - ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); -} - -static inline void dx_set_limit (struct dx_entry *entries, unsigned value) -{ - ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); -} - -static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) -{ - unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - - EXT3_DIR_REC_LEN(2) - infosize; - return entry_space / sizeof(struct dx_entry); -} - -static inline unsigned dx_node_limit (struct inode *dir) -{ - unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); - return entry_space / sizeof(struct dx_entry); -} - -/* - * Debug - */ -#ifdef DX_DEBUG -static void dx_show_index (char * label, struct dx_entry *entries) -{ - int i, n = dx_get_count (entries); - printk("%s index ", label); - for (i = 0; i < n; i++) - { - printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i)); - } - printk("\n"); -} - -struct stats -{ - unsigned names; - unsigned space; - unsigned bcount; -}; - -static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, - int size, int show_names) -{ - unsigned names = 0, space = 0; - char *base = (char *) de; - struct dx_hash_info h = *hinfo; - - printk("names: "); - while ((char *) de < base + size) - { - if (de->inode) - { - if (show_names) - { - int len = de->name_len; - char *name = de->name; - while (len--) printk("%c", *name++); - ext3fs_dirhash(de->name, de->name_len, &h); - printk(":%x.%u ", h.hash, - (unsigned) ((char *) de - base)); - } - space += EXT3_DIR_REC_LEN(de->name_len); - names++; - } - de = ext3_next_entry(de); - } - printk("(%i)\n", names); - return (struct stats) { names, space, 1 }; -} - -struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, - struct dx_entry *entries, int levels) -{ - unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count = dx_get_count (entries), names = 0, space = 0, i; - unsigned bcount = 0; - struct buffer_head *bh; - int err; - printk("%i indexed blocks...\n", count); - for (i = 0; i < count; i++, entries++) - { - u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; - u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; - struct stats stats; - printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); - if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; - stats = levels? - dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): - dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); - names += stats.names; - space += stats.space; - bcount += stats.bcount; - brelse (bh); - } - if (bcount) - printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", - names, space/bcount,(space/bcount)*100/blocksize); - return (struct stats) { names, space, bcount}; -} -#endif /* DX_DEBUG */ - -/* - * Probe for a directory leaf block to search. - * - * dx_probe can return ERR_BAD_DX_DIR, which means there was a format - * error in the directory index, and the caller should fall back to - * searching the directory normally. The callers of dx_probe **MUST** - * check for this error code, and make sure it never gets reflected - * back to userspace. - */ -static struct dx_frame * -dx_probe(struct qstr *entry, struct inode *dir, - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -{ - unsigned count, indirect; - struct dx_entry *at, *entries, *p, *q, *m; - struct dx_root *root; - struct buffer_head *bh; - struct dx_frame *frame = frame_in; - u32 hash; - - frame->bh = NULL; - if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) { - *err = ERR_BAD_DX_DIR; - goto fail; - } - root = (struct dx_root *) bh->b_data; - if (root->info.hash_version != DX_HASH_TEA && - root->info.hash_version != DX_HASH_HALF_MD4 && - root->info.hash_version != DX_HASH_LEGACY) { - ext3_warning(dir->i_sb, __func__, - "Unrecognised inode hash code %d", - root->info.hash_version); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - hinfo->hash_version = root->info.hash_version; - if (hinfo->hash_version <= DX_HASH_TEA) - hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; - hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; - if (entry) - ext3fs_dirhash(entry->name, entry->len, hinfo); - hash = hinfo->hash; - - if (root->info.unused_flags & 1) { - ext3_warning(dir->i_sb, __func__, - "Unimplemented inode hash flags: %#06x", - root->info.unused_flags); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - if ((indirect = root->info.indirect_levels) > 1) { - ext3_warning(dir->i_sb, __func__, - "Unimplemented inode hash depth: %#06x", - root->info.indirect_levels); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - entries = (struct dx_entry *) (((char *)&root->info) + - root->info.info_length); - - if (dx_get_limit(entries) != dx_root_limit(dir, - root->info.info_length)) { - ext3_warning(dir->i_sb, __func__, - "dx entry: limit != root limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - dxtrace (printk("Look up %x", hash)); - while (1) - { - count = dx_get_count(entries); - if (!count || count > dx_get_limit(entries)) { - ext3_warning(dir->i_sb, __func__, - "dx entry: no count or count > limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; - } - - p = entries + 1; - q = entries + count - 1; - while (p <= q) - { - m = p + (q - p)/2; - dxtrace(printk(".")); - if (dx_get_hash(m) > hash) - q = m - 1; - else - p = m + 1; - } - - if (0) // linear search cross check - { - unsigned n = count - 1; - at = entries; - while (n--) - { - dxtrace(printk(",")); - if (dx_get_hash(++at) > hash) - { - at--; - break; - } - } - assert (at == p - 1); - } - - at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); - frame->bh = bh; - frame->entries = entries; - frame->at = at; - if (!indirect--) return frame; - if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) { - *err = ERR_BAD_DX_DIR; - goto fail2; - } - at = entries = ((struct dx_node *) bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext3_warning(dir->i_sb, __func__, - "dx entry: limit != node limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; - } - frame++; - frame->bh = NULL; - } -fail2: - while (frame >= frame_in) { - brelse(frame->bh); - frame--; - } -fail: - if (*err == ERR_BAD_DX_DIR) - ext3_warning(dir->i_sb, __func__, - "Corrupt dir inode %ld, running e2fsck is " - "recommended.", dir->i_ino); - return NULL; -} - -static void dx_release (struct dx_frame *frames) -{ - if (frames[0].bh == NULL) - return; - - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); -} - -/* - * This function increments the frame pointer to search the next leaf - * block, and reads in the necessary intervening nodes if the search - * should be necessary. Whether or not the search is necessary is - * controlled by the hash parameter. If the hash value is even, then - * the search is only continued if the next block starts with that - * hash value. This is used if we are searching for a specific file. - * - * If the hash value is HASH_NB_ALWAYS, then always go to the next block. - * - * This function returns 1 if the caller should continue to search, - * or 0 if it should not. If there is an error reading one of the - * index blocks, it will a negative error code. - * - * If start_hash is non-null, it will be filled in with the starting - * hash of the next page. - */ -static int ext3_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash) -{ - struct dx_frame *p; - struct buffer_head *bh; - int err, num_frames = 0; - __u32 bhash; - - p = frame; - /* - * Find the next leaf page by incrementing the frame pointer. - * If we run out of entries in the interior node, loop around and - * increment pointer in the parent node. When we break out of - * this loop, num_frames indicates the number of interior - * nodes need to be read. - */ - while (1) { - if (++(p->at) < p->entries + dx_get_count(p->entries)) - break; - if (p == frames) - return 0; - num_frames++; - p--; - } - - /* - * If the hash is 1, then continue only if the next page has a - * continuation hash of any value. This is used for readdir - * handling. Otherwise, check to see if the hash matches the - * desired contiuation hash. If it doesn't, return since - * there's no point to read in the successive index pages. - */ - bhash = dx_get_hash(p->at); - if (start_hash) - *start_hash = bhash; - if ((hash & 1) == 0) { - if ((bhash & ~1) != hash) - return 0; - } - /* - * If the hash is HASH_NB_ALWAYS, we always go to the next - * block so no check is necessary - */ - while (num_frames--) { - if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) - return err; /* Failure */ - p++; - brelse (p->bh); - p->bh = bh; - p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; - } - return 1; -} - - -/* - * This function fills a red-black tree with information from a - * directory block. It returns the number directory entries loaded - * into the tree. If there is an error it is returned in err. - */ -static int htree_dirblock_to_tree(struct file *dir_file, - struct inode *dir, int block, - struct dx_hash_info *hinfo, - __u32 start_hash, __u32 start_minor_hash) -{ - struct buffer_head *bh; - struct ext3_dir_entry_2 *de, *top; - int err = 0, count = 0; - - dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); - - if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err))) - return err; - - de = (struct ext3_dir_entry_2 *) bh->b_data; - top = (struct ext3_dir_entry_2 *) ((char *) de + - dir->i_sb->s_blocksize - - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) { - if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, - (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) - +((char *)de - bh->b_data))) { - /* silently ignore the rest of the block */ - break; - } - ext3fs_dirhash(de->name, de->name_len, hinfo); - if ((hinfo->hash < start_hash) || - ((hinfo->hash == start_hash) && - (hinfo->minor_hash < start_minor_hash))) - continue; - if (de->inode == 0) - continue; - if ((err = ext3_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de)) != 0) { - brelse(bh); - return err; - } - count++; - } - brelse(bh); - return count; -} - - -/* - * This function fills a red-black tree with information from a - * directory. We start scanning the directory in hash order, starting - * at start_hash and start_minor_hash. - * - * This function returns the number of entries inserted into the tree, - * or a negative error code. - */ -int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, - __u32 start_minor_hash, __u32 *next_hash) -{ - struct dx_hash_info hinfo; - struct ext3_dir_entry_2 *de; - struct dx_frame frames[2], *frame; - struct inode *dir; - int block, err; - int count = 0; - int ret; - __u32 hashval; - - dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, - start_minor_hash)); - dir = file_inode(dir_file); - if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { - hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += - EXT3_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; - count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, - start_hash, start_minor_hash); - *next_hash = ~0; - return count; - } - hinfo.hash = start_hash; - hinfo.minor_hash = 0; - frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err); - if (!frame) - return err; - - /* Add '.' and '..' from the htree header */ - if (!start_hash && !start_minor_hash) { - de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; - if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) - goto errout; - count++; - } - if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { - de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; - de = ext3_next_entry(de); - if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0) - goto errout; - count++; - } - - while (1) { - block = dx_get_block(frame->at); - ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, - start_hash, start_minor_hash); - if (ret < 0) { - err = ret; - goto errout; - } - count += ret; - hashval = ~0; - ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, - frame, frames, &hashval); - *next_hash = hashval; - if (ret < 0) { - err = ret; - goto errout; - } - /* - * Stop if: (a) there are no more entries, or - * (b) we have inserted at least one entry and the - * next hash value is not a continuation - */ - if ((ret == 0) || - (count && ((hashval & 1) == 0))) - break; - } - dx_release(frames); - dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", - count, *next_hash)); - return count; -errout: - dx_release(frames); - return (err); -} - - -/* - * Directory block splitting, compacting - */ - -/* - * Create map of hash values, offsets, and sizes, stored at end of block. - * Returns number of entries mapped. - */ -static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) -{ - int count = 0; - char *base = (char *) de; - struct dx_hash_info h = *hinfo; - - while ((char *) de < base + blocksize) - { - if (de->name_len && de->inode) { - ext3fs_dirhash(de->name, de->name_len, &h); - map_tail--; - map_tail->hash = h.hash; - map_tail->offs = (u16) ((char *) de - base); - map_tail->size = le16_to_cpu(de->rec_len); - count++; - cond_resched(); - } - /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext3_next_entry(de); - } - return count; -} - -/* Sort map by hash value */ -static void dx_sort_map (struct dx_map_entry *map, unsigned count) -{ - struct dx_map_entry *p, *q, *top = map + count - 1; - int more; - /* Combsort until bubble sort doesn't suck */ - while (count > 2) - { - count = count*10/13; - if (count - 9 < 2) /* 9, 10 -> 11 */ - count = 11; - for (p = top, q = p - count; q >= map; p--, q--) - if (p->hash < q->hash) - swap(*p, *q); - } - /* Garden variety bubble sort */ - do { - more = 0; - q = top; - while (q-- > map) - { - if (q[1].hash >= q[0].hash) - continue; - swap(*(q+1), *q); - more = 1; - } - } while(more); -} - -static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) -{ - struct dx_entry *entries = frame->entries; - struct dx_entry *old = frame->at, *new = old + 1; - int count = dx_get_count(entries); - - assert(count < dx_get_limit(entries)); - assert(old < entries + count); - memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); - dx_set_hash(new, hash); - dx_set_block(new, block); - dx_set_count(entries, count + 1); -} - -static void ext3_update_dx_flag(struct inode *inode) -{ - if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX)) - EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; -} - -/* - * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. - * - * `len <= EXT3_NAME_LEN' is guaranteed by caller. - * `de != NULL' is guaranteed by caller. - */ -static inline int ext3_match (int len, const char * const name, - struct ext3_dir_entry_2 * de) -{ - if (len != de->name_len) - return 0; - if (!de->inode) - return 0; - return !memcmp(name, de->name, len); -} - -/* - * Returns 0 if not found, -1 on failure, and 1 on success - */ -static inline int search_dirblock(struct buffer_head * bh, - struct inode *dir, - struct qstr *child, - unsigned long offset, - struct ext3_dir_entry_2 ** res_dir) -{ - struct ext3_dir_entry_2 * de; - char * dlimit; - int de_len; - const char *name = child->name; - int namelen = child->len; - - de = (struct ext3_dir_entry_2 *) bh->b_data; - dlimit = bh->b_data + dir->i_sb->s_blocksize; - while ((char *) de < dlimit) { - /* this code is executed quadratically often */ - /* do minimal checking `by hand' */ - - if ((char *) de + namelen <= dlimit && - ext3_match (namelen, name, de)) { - /* found a match - just to be sure, do a full check */ - if (!ext3_check_dir_entry("ext3_find_entry", - dir, de, bh, offset)) - return -1; - *res_dir = de; - return 1; - } - /* prevent looping on a bad block */ - de_len = ext3_rec_len_from_disk(de->rec_len); - if (de_len <= 0) - return -1; - offset += de_len; - de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); - } - return 0; -} - - -/* - * ext3_find_entry() - * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_dir). It does NOT read the inode of the - * entry - you'll have to do that yourself if you want to. - * - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ -static struct buffer_head *ext3_find_entry(struct inode *dir, - struct qstr *entry, - struct ext3_dir_entry_2 **res_dir) -{ - struct super_block * sb; - struct buffer_head * bh_use[NAMEI_RA_SIZE]; - struct buffer_head * bh, *ret = NULL; - unsigned long start, block, b; - const u8 *name = entry->name; - int ra_max = 0; /* Number of bh's in the readahead - buffer, bh_use[] */ - int ra_ptr = 0; /* Current index into readahead - buffer */ - int num = 0; - int nblocks, i, err; - int namelen; - - *res_dir = NULL; - sb = dir->i_sb; - namelen = entry->len; - if (namelen > EXT3_NAME_LEN) - return NULL; - if ((namelen <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == 0)) { - /* - * "." or ".." will only be in the first block - * NFS may look up ".."; "." should be handled by the VFS - */ - block = start = 0; - nblocks = 1; - goto restart; - } - if (is_dx(dir)) { - bh = ext3_dx_find_entry(dir, entry, res_dir, &err); - /* - * On success, or if the error was file not found, - * return. Otherwise, fall back to doing a search the - * old fashioned way. - */ - if (bh || (err != ERR_BAD_DX_DIR)) - return bh; - dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); - } - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); - start = EXT3_I(dir)->i_dir_start_lookup; - if (start >= nblocks) - start = 0; - block = start; -restart: - do { - /* - * We deal with the read-ahead logic here. - */ - if (ra_ptr >= ra_max) { - /* Refill the readahead buffer */ - ra_ptr = 0; - b = block; - for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { - /* - * Terminate if we reach the end of the - * directory and must wrap, or if our - * search has finished at this block. - */ - if (b >= nblocks || (num && block == start)) { - bh_use[ra_max] = NULL; - break; - } - num++; - bh = ext3_getblk(NULL, dir, b++, 0, &err); - bh_use[ra_max] = bh; - if (bh && !bh_uptodate_or_lock(bh)) { - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, - bh); - } - } - } - if ((bh = bh_use[ra_ptr++]) == NULL) - goto next; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ - ext3_error(sb, __func__, "reading directory #%lu " - "offset %lu", dir->i_ino, block); - brelse(bh); - goto next; - } - i = search_dirblock(bh, dir, entry, - block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); - if (i == 1) { - EXT3_I(dir)->i_dir_start_lookup = block; - ret = bh; - goto cleanup_and_exit; - } else { - brelse(bh); - if (i < 0) - goto cleanup_and_exit; - } - next: - if (++block >= nblocks) - block = 0; - } while (block != start); - - /* - * If the directory has grown while we were searching, then - * search the last part of the directory before giving up. - */ - block = nblocks; - nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); - if (block < nblocks) { - start = 0; - goto restart; - } - -cleanup_and_exit: - /* Clean up the read-ahead blocks */ - for (; ra_ptr < ra_max; ra_ptr++) - brelse (bh_use[ra_ptr]); - return ret; -} - -static struct buffer_head * ext3_dx_find_entry(struct inode *dir, - struct qstr *entry, struct ext3_dir_entry_2 **res_dir, - int *err) -{ - struct super_block *sb = dir->i_sb; - struct dx_hash_info hinfo; - struct dx_frame frames[2], *frame; - struct buffer_head *bh; - unsigned long block; - int retval; - - if (!(frame = dx_probe(entry, dir, &hinfo, frames, err))) - return NULL; - do { - block = dx_get_block(frame->at); - if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err))) - goto errout; - - retval = search_dirblock(bh, dir, entry, - block << EXT3_BLOCK_SIZE_BITS(sb), - res_dir); - if (retval == 1) { - dx_release(frames); - return bh; - } - brelse(bh); - if (retval == -1) { - *err = ERR_BAD_DX_DIR; - goto errout; - } - - /* Check to see if we should continue to search */ - retval = ext3_htree_next_block(dir, hinfo.hash, frame, - frames, NULL); - if (retval < 0) { - ext3_warning(sb, __func__, - "error reading index page in directory #%lu", - dir->i_ino); - *err = retval; - goto errout; - } - } while (retval == 1); - - *err = -ENOENT; -errout: - dxtrace(printk("%s not found\n", entry->name)); - dx_release (frames); - return NULL; -} - -static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) -{ - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; - - if (dentry->d_name.len > EXT3_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - - bh = ext3_find_entry(dir, &dentry->d_name, &de); - inode = NULL; - if (bh) { - unsigned long ino = le32_to_cpu(de->inode); - brelse (bh); - if (!ext3_valid_inum(dir->i_sb, ino)) { - ext3_error(dir->i_sb, "ext3_lookup", - "bad inode number: %lu", ino); - return ERR_PTR(-EIO); - } - inode = ext3_iget(dir->i_sb, ino); - if (inode == ERR_PTR(-ESTALE)) { - ext3_error(dir->i_sb, __func__, - "deleted inode referenced: %lu", - ino); - return ERR_PTR(-EIO); - } - } - return d_splice_alias(inode, dentry); -} - - -struct dentry *ext3_get_parent(struct dentry *child) -{ - unsigned long ino; - struct qstr dotdot = QSTR_INIT("..", 2); - struct ext3_dir_entry_2 * de; - struct buffer_head *bh; - - bh = ext3_find_entry(d_inode(child), &dotdot, &de); - if (!bh) - return ERR_PTR(-ENOENT); - ino = le32_to_cpu(de->inode); - brelse(bh); - - if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) { - ext3_error(d_inode(child)->i_sb, "ext3_get_parent", - "bad inode number: %lu", ino); - return ERR_PTR(-EIO); - } - - return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino)); -} - -#define S_SHIFT 12 -static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK, -}; - -static inline void ext3_set_de_type(struct super_block *sb, - struct ext3_dir_entry_2 *de, - umode_t mode) { - if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -} - -/* - * Move count entries from end of map between two memory locations. - * Returns pointer to last entry moved. - */ -static struct ext3_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) -{ - unsigned rec_len = 0; - - while (count--) { - struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); - rec_len = EXT3_DIR_REC_LEN(de->name_len); - memcpy (to, de, rec_len); - ((struct ext3_dir_entry_2 *) to)->rec_len = - ext3_rec_len_to_disk(rec_len); - de->inode = 0; - map++; - to += rec_len; - } - return (struct ext3_dir_entry_2 *) (to - rec_len); -} - -/* - * Compact each dir entry in the range to the minimal rec_len. - * Returns pointer to last entry in range. - */ -static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize) -{ - struct ext3_dir_entry_2 *next, *to, *prev; - struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base; - unsigned rec_len = 0; - - prev = to = de; - while ((char *)de < base + blocksize) { - next = ext3_next_entry(de); - if (de->inode && de->name_len) { - rec_len = EXT3_DIR_REC_LEN(de->name_len); - if (de > to) - memmove(to, de, rec_len); - to->rec_len = ext3_rec_len_to_disk(rec_len); - prev = to; - to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); - } - de = next; - } - return prev; -} - -/* - * Split a full leaf block to make room for a new dir entry. - * Allocate a new block, and move entries so that they are approx. equally full. - * Returns pointer to de in block into which the new entry will be inserted. - */ -static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, - struct buffer_head **bh,struct dx_frame *frame, - struct dx_hash_info *hinfo, int *error) -{ - unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count, continued; - struct buffer_head *bh2; - u32 newblock; - u32 hash2; - struct dx_map_entry *map; - char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size; - struct ext3_dir_entry_2 *de = NULL, *de2; - int err = 0, i; - - bh2 = ext3_append (handle, dir, &newblock, &err); - if (!(bh2)) { - brelse(*bh); - *bh = NULL; - goto errout; - } - - BUFFER_TRACE(*bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, *bh); - if (err) - goto journal_error; - - BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; - - data2 = bh2->b_data; - - /* create map in the end of data2 block */ - map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map ((struct ext3_dir_entry_2 *) data1, - blocksize, hinfo, map); - map -= count; - dx_sort_map (map, count); - /* Split the existing block in the middle, size-wise */ - size = 0; - move = 0; - for (i = count-1; i >= 0; i--) { - /* is more than half of this entry in 2nd half of the block? */ - if (size + map[i].size/2 > blocksize/2) - break; - size += map[i].size; - move++; - } - /* map index at which we will split */ - split = count - move; - hash2 = map[split].hash; - continued = hash2 == map[split - 1].hash; - dxtrace(printk("Split block %i at %x, %i/%i\n", - dx_get_block(frame->at), hash2, split, count-split)); - - /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split); - de = dx_pack_dirents(data1,blocksize); - de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); - de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2); - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); - dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); - - /* Which block gets the new entry? */ - if (hinfo->hash >= hash2) - { - swap(*bh, bh2); - de = de2; - } - dx_insert_block (frame, hash2 + continued, newblock); - err = ext3_journal_dirty_metadata (handle, bh2); - if (err) - goto journal_error; - err = ext3_journal_dirty_metadata (handle, frame->bh); - if (err) - goto journal_error; - brelse (bh2); - dxtrace(dx_show_index ("frame", frame->entries)); - return de; - -journal_error: - brelse(*bh); - brelse(bh2); - *bh = NULL; - ext3_std_error(dir->i_sb, err); -errout: - *error = err; - return NULL; -} - - -/* - * Add a new entry into a directory (leaf) block. If de is non-NULL, - * it points to a directory entry which is guaranteed to be large - * enough for new directory entry. If de is NULL, then - * add_dirent_to_buf will attempt search the directory block for - * space. It will return -ENOSPC if no space is available, and -EIO - * and -EEXIST if directory entry already exists. - * - * NOTE! bh is NOT released in the case where ENOSPC is returned. In - * all other cases bh is released. - */ -static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, - struct inode *inode, struct ext3_dir_entry_2 *de, - struct buffer_head * bh) -{ - struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned long offset = 0; - unsigned short reclen; - int nlen, rlen, err; - char *top; - - reclen = EXT3_DIR_REC_LEN(namelen); - if (!de) { - de = (struct ext3_dir_entry_2 *)bh->b_data; - top = bh->b_data + dir->i_sb->s_blocksize - reclen; - while ((char *) de <= top) { - if (!ext3_check_dir_entry("ext3_add_entry", dir, de, - bh, offset)) { - brelse (bh); - return -EIO; - } - if (ext3_match (namelen, name, de)) { - brelse (bh); - return -EEXIST; - } - nlen = EXT3_DIR_REC_LEN(de->name_len); - rlen = ext3_rec_len_from_disk(de->rec_len); - if ((de->inode? rlen - nlen: rlen) >= reclen) - break; - de = (struct ext3_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; - } - BUFFER_TRACE(bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh); - if (err) { - ext3_std_error(dir->i_sb, err); - brelse(bh); - return err; - } - - /* By now the buffer is marked for journaling */ - nlen = EXT3_DIR_REC_LEN(de->name_len); - rlen = ext3_rec_len_from_disk(de->rec_len); - if (de->inode) { - struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext3_rec_len_to_disk(rlen - nlen); - de->rec_len = ext3_rec_len_to_disk(nlen); - de = de1; - } - de->file_type = EXT3_FT_UNKNOWN; - if (inode) { - de->inode = cpu_to_le32(inode->i_ino); - ext3_set_de_type(dir->i_sb, de, inode->i_mode); - } else - de->inode = 0; - de->name_len = namelen; - memcpy (de->name, name, namelen); - /* - * XXX shouldn't update any times until successful - * completion of syscall, but too many callers depend - * on this. - * - * XXX similarly, too many callers depend on - * ext3_new_inode() setting the times, but error - * recovery deletes the inode, so the worst that can - * happen is that the times are slightly out of date - * and/or different from the directory change time. - */ - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; - ext3_update_dx_flag(dir); - dir->i_version++; - ext3_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) - ext3_std_error(dir->i_sb, err); - brelse(bh); - return 0; -} - -/* - * This converts a one block unindexed directory to a 3 block indexed - * directory, and adds the dentry to the indexed directory. - */ -static int make_indexed_dir(handle_t *handle, struct dentry *dentry, - struct inode *inode, struct buffer_head *bh) -{ - struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct buffer_head *bh2; - struct dx_root *root; - struct dx_frame frames[2], *frame; - struct dx_entry *entries; - struct ext3_dir_entry_2 *de, *de2; - char *data1, *top; - unsigned len; - int retval; - unsigned blocksize; - struct dx_hash_info hinfo; - u32 block; - struct fake_dirent *fde; - - blocksize = dir->i_sb->s_blocksize; - dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); - retval = ext3_journal_get_write_access(handle, bh); - if (retval) { - ext3_std_error(dir->i_sb, retval); - brelse(bh); - return retval; - } - root = (struct dx_root *) bh->b_data; - - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext3_dir_entry_2 *)((char *)fde + - ext3_rec_len_from_disk(fde->rec_len)); - if ((char *) de >= (((char *) root) + blocksize)) { - ext3_error(dir->i_sb, __func__, - "invalid rec_len for '..' in inode %lu", - dir->i_ino); - brelse(bh); - return -EIO; - } - len = ((char *) root) + blocksize - (char *) de; - - bh2 = ext3_append (handle, dir, &block, &retval); - if (!(bh2)) { - brelse(bh); - return retval; - } - EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; - data1 = bh2->b_data; - - memcpy (data1, de, len); - de = (struct ext3_dir_entry_2 *) data1; - top = data1 + len; - while ((char *)(de2 = ext3_next_entry(de)) < top) - de = de2; - de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de); - /* Initialize the root; the dot dirents already exist */ - de = (struct ext3_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2)); - memset (&root->info, 0, sizeof(root->info)); - root->info.info_length = sizeof(root->info); - root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; - entries = root->entries; - dx_set_block (entries, 1); - dx_set_count (entries, 1); - dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); - - /* Initialize as for dx_probe */ - hinfo.hash_version = root->info.hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; - ext3fs_dirhash(name, namelen, &hinfo); - frame = frames; - frame->entries = entries; - frame->at = entries; - frame->bh = bh; - bh = bh2; - /* - * Mark buffers dirty here so that if do_split() fails we write a - * consistent set of buffers to disk. - */ - ext3_journal_dirty_metadata(handle, frame->bh); - ext3_journal_dirty_metadata(handle, bh); - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - if (!de) { - ext3_mark_inode_dirty(handle, dir); - dx_release(frames); - return retval; - } - dx_release(frames); - - return add_dirent_to_buf(handle, dentry, inode, de, bh); -} - -/* - * ext3_add_entry() - * - * adds a file entry to the specified directory, using the same - * semantics as ext3_find_entry(). It returns NULL if it failed. - * - * NOTE!! The inode part of 'de' is left at 0 - which means you - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. - */ -static int ext3_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) -{ - struct inode *dir = d_inode(dentry->d_parent); - struct buffer_head * bh; - struct ext3_dir_entry_2 *de; - struct super_block * sb; - int retval; - int dx_fallback=0; - unsigned blocksize; - u32 block, blocks; - - sb = dir->i_sb; - blocksize = sb->s_blocksize; - if (!dentry->d_name.len) - return -EINVAL; - if (is_dx(dir)) { - retval = ext3_dx_add_entry(handle, dentry, inode); - if (!retval || (retval != ERR_BAD_DX_DIR)) - return retval; - EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; - dx_fallback++; - ext3_mark_inode_dirty(handle, dir); - } - blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0; block < blocks; block++) { - if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval))) - return retval; - - retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) - return retval; - - if (blocks == 1 && !dx_fallback && - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); - brelse(bh); - } - bh = ext3_append(handle, dir, &block, &retval); - if (!bh) - return retval; - de = (struct ext3_dir_entry_2 *) bh->b_data; - de->inode = 0; - de->rec_len = ext3_rec_len_to_disk(blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); -} - -/* - * Returns 0 for success, or a negative error value - */ -static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) -{ - struct dx_frame frames[2], *frame; - struct dx_entry *entries, *at; - struct dx_hash_info hinfo; - struct buffer_head * bh; - struct inode *dir = d_inode(dentry->d_parent); - struct super_block * sb = dir->i_sb; - struct ext3_dir_entry_2 *de; - int err; - - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); - if (!frame) - return err; - entries = frame->entries; - at = frame->at; - - if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err))) - goto cleanup; - - BUFFER_TRACE(bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh); - if (err) - goto journal_error; - - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) { - bh = NULL; - goto cleanup; - } - - /* Block full, should compress but for now just split */ - dxtrace(printk("using %u of %u node entries\n", - dx_get_count(entries), dx_get_limit(entries))); - /* Need to split index? */ - if (dx_get_count(entries) == dx_get_limit(entries)) { - u32 newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; - struct dx_entry *entries2; - struct dx_node *node2; - struct buffer_head *bh2; - - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext3_warning(sb, __func__, - "Directory index full!"); - err = -ENOSPC; - goto cleanup; - } - bh2 = ext3_append (handle, dir, &newblock, &err); - if (!(bh2)) - goto cleanup; - node2 = (struct dx_node *)(bh2->b_data); - entries2 = node2->entries; - memset(&node2->fake, 0, sizeof(struct fake_dirent)); - node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); - BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; - if (levels) { - unsigned icount1 = icount/2, icount2 = icount - icount1; - unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk("Split index %i/%i\n", icount1, icount2)); - - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext3_journal_get_write_access(handle, - frames[0].bh); - if (err) - goto journal_error; - - memcpy ((char *) entries2, (char *) (entries + icount1), - icount2 * sizeof(struct dx_entry)); - dx_set_count (entries, icount1); - dx_set_count (entries2, icount2); - dx_set_limit (entries2, dx_node_limit(dir)); - - /* Which index block gets the new entry? */ - if (at - entries >= icount1) { - frame->at = at = at - entries - icount1 + entries2; - frame->entries = entries = entries2; - swap(frame->bh, bh2); - } - dx_insert_block (frames + 0, hash2, newblock); - dxtrace(dx_show_index ("node", frames[1].entries)); - dxtrace(dx_show_index ("node", - ((struct dx_node *) bh2->b_data)->entries)); - err = ext3_journal_dirty_metadata(handle, bh2); - if (err) - goto journal_error; - brelse (bh2); - } else { - dxtrace(printk("Creating second level index...\n")); - memcpy((char *) entries2, (char *) entries, - icount * sizeof(struct dx_entry)); - dx_set_limit(entries2, dx_node_limit(dir)); - - /* Set up root */ - dx_set_count(entries, 1); - dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext3_journal_get_write_access(handle, - frame->bh); - if (err) - goto journal_error; - } - err = ext3_journal_dirty_metadata(handle, frames[0].bh); - if (err) - goto journal_error; - } - de = do_split(handle, dir, &bh, frame, &hinfo, &err); - if (!de) - goto cleanup; - err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; - goto cleanup; - -journal_error: - ext3_std_error(dir->i_sb, err); -cleanup: - if (bh) - brelse(bh); - dx_release(frames); - return err; -} - -/* - * ext3_delete_entry deletes a directory entry by merging it with the - * previous entry - */ -static int ext3_delete_entry (handle_t *handle, - struct inode * dir, - struct ext3_dir_entry_2 * de_del, - struct buffer_head * bh) -{ - struct ext3_dir_entry_2 * de, * pde; - int i; - - i = 0; - pde = NULL; - de = (struct ext3_dir_entry_2 *) bh->b_data; - while (i < bh->b_size) { - if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) - return -EIO; - if (de == de_del) { - int err; - - BUFFER_TRACE(bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh); - if (err) - goto journal_error; - - if (pde) - pde->rec_len = ext3_rec_len_to_disk( - ext3_rec_len_from_disk(pde->rec_len) + - ext3_rec_len_from_disk(de->rec_len)); - else - de->inode = 0; - dir->i_version++; - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) { -journal_error: - ext3_std_error(dir->i_sb, err); - return err; - } - return 0; - } - i += ext3_rec_len_from_disk(de->rec_len); - pde = de; - de = ext3_next_entry(de); - } - return -ENOENT; -} - -static int ext3_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) -{ - int err = ext3_add_entry(handle, dentry, inode); - if (!err) { - ext3_mark_inode_dirty(handle, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); - return 0; - } - drop_nlink(inode); - unlock_new_inode(inode); - iput(inode); - return err; -} - -/* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it - * is so far negative - it has no inode. - * - * If the create succeeds, we fill in the inode information - * with d_instantiate(). - */ -static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode, - bool excl) -{ - handle_t *handle; - struct inode * inode; - int err, retries = 0; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - ext3_set_aops(inode); - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -static int ext3_mknod (struct inode * dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - handle_t *handle; - struct inode *inode; - int err, retries = 0; - - if (!new_valid_dev(rdev)) - return -EINVAL; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT3_FS_XATTR - inode->i_op = &ext3_special_inode_operations; -#endif - err = ext3_add_nondir(handle, dentry, inode); - } - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - handle_t *handle; - struct inode *inode; - int err, retries = 0; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + - 4 + EXT3_XATTR_TRANS_BLOCKS); - - if (IS_ERR(handle)) - return PTR_ERR(handle); - - inode = ext3_new_inode (handle, dir, NULL, mode); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - ext3_set_aops(inode); - d_tmpfile(dentry, inode); - err = ext3_orphan_add(handle, inode); - if (err) - goto err_unlock_inode; - mark_inode_dirty(inode); - unlock_new_inode(inode); - } - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -err_unlock_inode: - ext3_journal_stop(handle); - unlock_new_inode(inode); - return err; -} - -static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) -{ - handle_t *handle; - struct inode * inode; - struct buffer_head * dir_block = NULL; - struct ext3_dir_entry_2 * de; - int err, retries = 0; - - if (dir->i_nlink >= EXT3_LINK_MAX) - return -EMLINK; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; - - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; - if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err))) - goto out_clear_inode; - - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext3_journal_get_write_access(handle, dir_block); - if (err) - goto out_clear_inode; - - de = (struct ext3_dir_entry_2 *) dir_block->b_data; - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len)); - strcpy (de->name, "."); - ext3_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext3_next_entry(de); - de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize - - EXT3_DIR_REC_LEN(1)); - de->name_len = 2; - strcpy (de->name, ".."); - ext3_set_de_type(dir->i_sb, de, S_IFDIR); - set_nlink(inode, 2); - BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, dir_block); - if (err) - goto out_clear_inode; - - err = ext3_mark_inode_dirty(handle, inode); - if (!err) - err = ext3_add_entry (handle, dentry, inode); - - if (err) { -out_clear_inode: - clear_nlink(inode); - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; - } - inc_nlink(dir); - ext3_update_dx_flag(dir); - err = ext3_mark_inode_dirty(handle, dir); - if (err) - goto out_clear_inode; - - unlock_new_inode(inode); - d_instantiate(dentry, inode); -out_stop: - brelse(dir_block); - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -/* - * routine to check that the specified directory is empty (for rmdir) - */ -static int empty_dir (struct inode * inode) -{ - unsigned long offset; - struct buffer_head * bh; - struct ext3_dir_entry_2 * de, * de1; - struct super_block * sb; - int err = 0; - - sb = inode->i_sb; - if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || - !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) { - if (err) - ext3_error(inode->i_sb, __func__, - "error %d reading directory #%lu offset 0", - err, inode->i_ino); - else - ext3_warning(inode->i_sb, __func__, - "bad directory (dir #%lu) - no data block", - inode->i_ino); - return 1; - } - de = (struct ext3_dir_entry_2 *) bh->b_data; - de1 = ext3_next_entry(de); - if (le32_to_cpu(de->inode) != inode->i_ino || - !le32_to_cpu(de1->inode) || - strcmp (".", de->name) || - strcmp ("..", de1->name)) { - ext3_warning (inode->i_sb, "empty_dir", - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); - brelse (bh); - return 1; - } - offset = ext3_rec_len_from_disk(de->rec_len) + - ext3_rec_len_from_disk(de1->rec_len); - de = ext3_next_entry(de1); - while (offset < inode->i_size ) { - if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { - err = 0; - brelse (bh); - if (!(bh = ext3_dir_bread (NULL, inode, - offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) { - if (err) - ext3_error(sb, __func__, - "error %d reading directory" - " #%lu offset %lu", - err, inode->i_ino, offset); - offset += sb->s_blocksize; - continue; - } - de = (struct ext3_dir_entry_2 *) bh->b_data; - } - if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) { - de = (struct ext3_dir_entry_2 *)(bh->b_data + - sb->s_blocksize); - offset = (offset | (sb->s_blocksize - 1)) + 1; - continue; - } - if (le32_to_cpu(de->inode)) { - brelse (bh); - return 0; - } - offset += ext3_rec_len_from_disk(de->rec_len); - de = ext3_next_entry(de); - } - brelse (bh); - return 1; -} - -/* ext3_orphan_add() links an unlinked or truncated inode into a list of - * such inodes, starting at the superblock, in case we crash before the - * file is closed/deleted, or in case the inode truncate spans multiple - * transactions and the last transaction is not recovered after a crash. - * - * At filesystem recovery time, we walk this list deleting unlinked - * inodes and truncating linked inodes in ext3_orphan_cleanup(). - */ -int ext3_orphan_add(handle_t *handle, struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ext3_iloc iloc; - int err = 0, rc; - - mutex_lock(&EXT3_SB(sb)->s_orphan_lock); - if (!list_empty(&EXT3_I(inode)->i_orphan)) - goto out_unlock; - - /* Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. */ - - /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext3_orphan_add(). We block - * here (on s_orphan_lock), so race with ext3_link() which might bump - * ->i_nlink. For, say it, character device. Not a regular file, - * not a directory, not a symlink and ->i_nlink > 0. - * - * tytso, 4/25/2009: I'm not sure how that could happen; - * shouldn't the fs core protect us from these sort of - * unlink()/link() races? - */ - J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - - BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); - err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); - if (err) - goto out_unlock; - - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out_unlock; - - /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); - EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - rc = ext3_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - - /* Only add to the head of the in-memory list if all the - * previous operations succeeded. If the orphan_add is going to - * fail (possibly taking the journal offline), we can't risk - * leaving the inode on the orphan list: stray orphan-list - * entries can cause panics at unmount time. - * - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. */ - if (!err) - list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); - jbd_debug(4, "orphan inode %lu will point to %d\n", - inode->i_ino, NEXT_ORPHAN(inode)); -out_unlock: - mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); - ext3_std_error(inode->i_sb, err); - return err; -} - -/* - * ext3_orphan_del() removes an unlinked or truncated inode from the list - * of such inodes stored on disk, because it is finally being cleaned up. - */ -int ext3_orphan_del(handle_t *handle, struct inode *inode) -{ - struct list_head *prev; - struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_sb_info *sbi; - unsigned long ino_next; - struct ext3_iloc iloc; - int err = 0; - - mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); - if (list_empty(&ei->i_orphan)) - goto out; - - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; - sbi = EXT3_SB(inode->i_sb); - - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - - list_del_init(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on - * disk, but we still need to remove the inode from the linked - * list in memory. */ - if (!handle) - goto out; - - err = ext3_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out_err; - - if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %lu\n", ino_next); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext3_journal_get_write_access(handle, sbi->s_sbh); - if (err) - goto out_brelse; - sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); - } else { - struct ext3_iloc iloc2; - struct inode *i_prev = - &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode; - - jbd_debug(4, "orphan inode %lu will point to %lu\n", - i_prev->i_ino, ino_next); - err = ext3_reserve_inode_write(handle, i_prev, &iloc2); - if (err) - goto out_brelse; - NEXT_ORPHAN(i_prev) = ino_next; - err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); - } - if (err) - goto out_brelse; - NEXT_ORPHAN(inode) = 0; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - -out_err: - ext3_std_error(inode->i_sb, err); -out: - mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); - return err; - -out_brelse: - brelse(iloc.bh); - goto out_err; -} - -static int ext3_rmdir (struct inode * dir, struct dentry *dentry) -{ - int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext3_dir_entry_2 * de; - handle_t *handle; - - /* Initialize quotas before so that eventual writes go in - * separate transaction */ - dquot_initialize(dir); - dquot_initialize(d_inode(dentry)); - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - retval = -ENOENT; - bh = ext3_find_entry(dir, &dentry->d_name, &de); - if (!bh) - goto end_rmdir; - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = d_inode(dentry); - - retval = -EIO; - if (le32_to_cpu(de->inode) != inode->i_ino) - goto end_rmdir; - - retval = -ENOTEMPTY; - if (!empty_dir (inode)) - goto end_rmdir; - - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_rmdir; - if (inode->i_nlink != 2) - ext3_warning (inode->i_sb, "ext3_rmdir", - "empty directory has nlink!=2 (%d)", - inode->i_nlink); - inode->i_version++; - clear_nlink(inode); - /* There's no need to set i_disksize: the fact that i_nlink is - * zero will ensure that the right thing happens during any - * recovery. */ - inode->i_size = 0; - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, inode); - drop_nlink(dir); - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - -end_rmdir: - ext3_journal_stop(handle); - brelse (bh); - return retval; -} - -static int ext3_unlink(struct inode * dir, struct dentry *dentry) -{ - int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext3_dir_entry_2 * de; - handle_t *handle; - - trace_ext3_unlink_enter(dir, dentry); - /* Initialize quotas before so that eventual writes go - * in separate transaction */ - dquot_initialize(dir); - dquot_initialize(d_inode(dentry)); - - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - retval = -ENOENT; - bh = ext3_find_entry(dir, &dentry->d_name, &de); - if (!bh) - goto end_unlink; - - inode = d_inode(dentry); - - retval = -EIO; - if (le32_to_cpu(de->inode) != inode->i_ino) - goto end_unlink; - - if (!inode->i_nlink) { - ext3_warning (inode->i_sb, "ext3_unlink", - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); - set_nlink(inode, 1); - } - retval = ext3_delete_entry(handle, dir, de, bh); - if (retval) - goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - ext3_update_dx_flag(dir); - ext3_mark_inode_dirty(handle, dir); - drop_nlink(inode); - if (!inode->i_nlink) - ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime; - ext3_mark_inode_dirty(handle, inode); - retval = 0; - -end_unlink: - ext3_journal_stop(handle); - brelse (bh); - trace_ext3_unlink_exit(dentry, retval); - return retval; -} - -static int ext3_symlink (struct inode * dir, - struct dentry *dentry, const char * symname) -{ - handle_t *handle; - struct inode * inode; - int l, err, retries = 0; - int credits; - - l = strlen(symname)+1; - if (l > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - - dquot_initialize(dir); - - if (l > EXT3_N_BLOCKS * 4) { - /* - * For non-fast symlinks, we just allocate inode and put it on - * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks, and - * possibly selinux xattr blocks. - */ - credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + - EXT3_XATTR_TRANS_BLOCKS; - } else { - /* - * Fast symlink. We have to add entry to directory - * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), - * allocate new inode (bitmap, group descriptor, inode block, - * quota blocks, sb is already counted in previous macros). - */ - credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); - } -retry: - handle = ext3_journal_start(dir, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; - - if (l > EXT3_N_BLOCKS * 4) { - inode->i_op = &ext3_symlink_inode_operations; - ext3_set_aops(inode); - /* - * We cannot call page_symlink() with transaction started - * because it calls into ext3_write_begin() which acquires page - * lock which ranks below transaction start (and it can also - * wait for journal commit if we are running out of space). So - * we have to stop transaction now and restart it when symlink - * contents is written. - * - * To keep fs consistent in case of crash, we have to put inode - * to orphan list in the mean time. - */ - drop_nlink(inode); - err = ext3_orphan_add(handle, inode); - ext3_journal_stop(handle); - if (err) - goto err_drop_inode; - err = __page_symlink(inode, symname, l, 1); - if (err) - goto err_drop_inode; - /* - * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS - * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified - */ - handle = ext3_journal_start(dir, - EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto err_drop_inode; - } - set_nlink(inode, 1); - err = ext3_orphan_del(handle, inode); - if (err) { - ext3_journal_stop(handle); - drop_nlink(inode); - goto err_drop_inode; - } - } else { - inode->i_op = &ext3_fast_symlink_inode_operations; - inode->i_link = (char*)&EXT3_I(inode)->i_data; - memcpy(inode->i_link, symname, l); - inode->i_size = l-1; - } - EXT3_I(inode)->i_disksize = inode->i_size; - err = ext3_add_nondir(handle, dentry, inode); -out_stop: - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -err_drop_inode: - unlock_new_inode(inode); - iput(inode); - return err; -} - -static int ext3_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) -{ - handle_t *handle; - struct inode *inode = d_inode(old_dentry); - int err, retries = 0; - - if (inode->i_nlink >= EXT3_LINK_MAX) - return -EMLINK; - - dquot_initialize(dir); - -retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - - inode->i_ctime = CURRENT_TIME_SEC; - inc_nlink(inode); - ihold(inode); - - err = ext3_add_entry(handle, dentry, inode); - if (!err) { - ext3_mark_inode_dirty(handle, inode); - /* this can happen only for tmpfile being - * linked the first time - */ - if (inode->i_nlink == 1) - ext3_orphan_del(handle, inode); - d_instantiate(dentry, inode); - } else { - drop_nlink(inode); - iput(inode); - } - ext3_journal_stop(handle); - if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -#define PARENT_INO(buffer) \ - (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode) - -/* - * Anybody can rename anything with this: the permission checks are left to the - * higher-level routines. - */ -static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir,struct dentry *new_dentry) -{ - handle_t *handle; - struct inode * old_inode, * new_inode; - struct buffer_head * old_bh, * new_bh, * dir_bh; - struct ext3_dir_entry_2 * old_de, * new_de; - int retval, flush_file = 0; - - dquot_initialize(old_dir); - dquot_initialize(new_dir); - - old_bh = new_bh = dir_bh = NULL; - - /* Initialize quotas before so that eventual writes go - * in separate transaction */ - if (d_really_is_positive(new_dentry)) - dquot_initialize(d_inode(new_dentry)); - handle = ext3_journal_start(old_dir, 2 * - EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - handle->h_sync = 1; - - old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de); - /* - * Check for inode number is _not_ due to possible IO errors. - * We might rmdir the source, keep it as pwd of some process - * and merrily kill the link to whatever was created under the - * same name. Goodbye sticky bit ;-< - */ - old_inode = d_inode(old_dentry); - retval = -ENOENT; - if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) - goto end_rename; - - new_inode = d_inode(new_dentry); - new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); - if (new_bh) { - if (!new_inode) { - brelse (new_bh); - new_bh = NULL; - } - } - if (S_ISDIR(old_inode->i_mode)) { - if (new_inode) { - retval = -ENOTEMPTY; - if (!empty_dir (new_inode)) - goto end_rename; - } - retval = -EIO; - dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval); - if (!dir_bh) - goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT3_LINK_MAX) - goto end_rename; - } - if (!new_bh) { - retval = ext3_add_entry (handle, new_dentry, old_inode); - if (retval) - goto end_rename; - } else { - BUFFER_TRACE(new_bh, "get write access"); - retval = ext3_journal_get_write_access(handle, new_bh); - if (retval) - goto journal_error; - new_de->inode = cpu_to_le32(old_inode->i_ino); - if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, - EXT3_FEATURE_INCOMPAT_FILETYPE)) - new_de->file_type = old_de->file_type; - new_dir->i_version++; - new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); - retval = ext3_journal_dirty_metadata(handle, new_bh); - if (retval) - goto journal_error; - brelse(new_bh); - new_bh = NULL; - } - - /* - * Like most other Unix systems, set the ctime for inodes on a - * rename. - */ - old_inode->i_ctime = CURRENT_TIME_SEC; - ext3_mark_inode_dirty(handle, old_inode); - - /* - * ok, that's it - */ - if (le32_to_cpu(old_de->inode) != old_inode->i_ino || - old_de->name_len != old_dentry->d_name.len || - strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || - (retval = ext3_delete_entry(handle, old_dir, - old_de, old_bh)) == -ENOENT) { - /* old_de could have moved from under us during htree split, so - * make sure that we are deleting the right entry. We might - * also be pointing to a stale entry in the unused part of - * old_bh so just checking inum and the name isn't enough. */ - struct buffer_head *old_bh2; - struct ext3_dir_entry_2 *old_de2; - - old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name, - &old_de2); - if (old_bh2) { - retval = ext3_delete_entry(handle, old_dir, - old_de2, old_bh2); - brelse(old_bh2); - } - } - if (retval) { - ext3_warning(old_dir->i_sb, "ext3_rename", - "Deleting old file (%lu), %d, error=%d", - old_dir->i_ino, old_dir->i_nlink, retval); - } - - if (new_inode) { - drop_nlink(new_inode); - new_inode->i_ctime = CURRENT_TIME_SEC; - } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; - ext3_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - retval = ext3_journal_get_write_access(handle, dir_bh); - if (retval) - goto journal_error; - PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); - retval = ext3_journal_dirty_metadata(handle, dir_bh); - if (retval) { -journal_error: - ext3_std_error(new_dir->i_sb, retval); - goto end_rename; - } - drop_nlink(old_dir); - if (new_inode) { - drop_nlink(new_inode); - } else { - inc_nlink(new_dir); - ext3_update_dx_flag(new_dir); - ext3_mark_inode_dirty(handle, new_dir); - } - } - ext3_mark_inode_dirty(handle, old_dir); - if (new_inode) { - ext3_mark_inode_dirty(handle, new_inode); - if (!new_inode->i_nlink) - ext3_orphan_add(handle, new_inode); - if (ext3_should_writeback_data(new_inode)) - flush_file = 1; - } - retval = 0; - -end_rename: - brelse (dir_bh); - brelse (old_bh); - brelse (new_bh); - ext3_journal_stop(handle); - if (retval == 0 && flush_file) - filemap_flush(old_inode->i_mapping); - return retval; -} - -/* - * directories can handle most operations... - */ -const struct inode_operations ext3_dir_inode_operations = { - .create = ext3_create, - .lookup = ext3_lookup, - .link = ext3_link, - .unlink = ext3_unlink, - .symlink = ext3_symlink, - .mkdir = ext3_mkdir, - .rmdir = ext3_rmdir, - .mknod = ext3_mknod, - .tmpfile = ext3_tmpfile, - .rename = ext3_rename, - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext3_get_acl, - .set_acl = ext3_set_acl, -}; - -const struct inode_operations ext3_special_inode_operations = { - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext3_get_acl, - .set_acl = ext3_set_acl, -}; diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h deleted file mode 100644 index 46304d8c9f0a..000000000000 --- a/fs/ext3/namei.h +++ /dev/null @@ -1,27 +0,0 @@ -/* linux/fs/ext3/namei.h - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks <ben@simtec.co.uk> - * -*/ - -extern struct dentry *ext3_get_parent(struct dentry *child); - -static inline struct buffer_head *ext3_dir_bread(handle_t *handle, - struct inode *inode, - int block, int create, - int *err) -{ - struct buffer_head *bh; - - bh = ext3_bread(handle, inode, block, create, err); - - if (!bh && !(*err)) { - *err = -EIO; - ext3_error(inode->i_sb, __func__, - "Directory hole detected on inode %lu\n", - inode->i_ino); - return NULL; - } - return bh; -} diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c deleted file mode 100644 index 27105655502c..000000000000 --- a/fs/ext3/resize.c +++ /dev/null @@ -1,1117 +0,0 @@ -/* - * linux/fs/ext3/resize.c - * - * Support for resizing an ext3 filesystem while it is mounted. - * - * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com> - * - * This could probably be made into a module, because it is not often in use. - */ - - -#define EXT3FS_DEBUG - -#include "ext3.h" - - -#define outside(b, first, last) ((b) < (first) || (b) >= (last)) -#define inside(b, first, last) ((b) >= (first) && (b) < (last)) - -static int verify_group_input(struct super_block *sb, - struct ext3_new_group_data *input) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count); - ext3_fsblk_t end = start + input->blocks_count; - unsigned group = input->group; - ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext3_bg_has_super(sb, group) ? - (1 + ext3_bg_num_gdb(sb, group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; - ext3_fsblk_t metaend = start + overhead; - struct buffer_head *bh = NULL; - ext3_grpblk_t free_blocks_count; - int err = -EINVAL; - - input->free_blocks_count = free_blocks_count = - input->blocks_count - 2 - overhead - sbi->s_itb_per_group; - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks " - "(%d free, %u reserved)\n", - ext3_bg_has_super(sb, input->group) ? "normal" : - "no-super", input->group, input->blocks_count, - free_blocks_count, input->reserved_blocks); - - if (group != sbi->s_groups_count) - ext3_warning(sb, __func__, - "Cannot add at group %u (only %lu groups)", - input->group, sbi->s_groups_count); - else if ((start - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb)) - ext3_warning(sb, __func__, "Last group not full"); - else if (input->reserved_blocks > input->blocks_count / 5) - ext3_warning(sb, __func__, "Reserved blocks too high (%u)", - input->reserved_blocks); - else if (free_blocks_count < 0) - ext3_warning(sb, __func__, "Bad blocks count %u", - input->blocks_count); - else if (!(bh = sb_bread(sb, end - 1))) - ext3_warning(sb, __func__, - "Cannot read last block ("E3FSBLK")", - end - 1); - else if (outside(input->block_bitmap, start, end)) - ext3_warning(sb, __func__, - "Block bitmap not in group (block %u)", - input->block_bitmap); - else if (outside(input->inode_bitmap, start, end)) - ext3_warning(sb, __func__, - "Inode bitmap not in group (block %u)", - input->inode_bitmap); - else if (outside(input->inode_table, start, end) || - outside(itend - 1, start, end)) - ext3_warning(sb, __func__, - "Inode table not in group (blocks %u-"E3FSBLK")", - input->inode_table, itend - 1); - else if (input->inode_bitmap == input->block_bitmap) - ext3_warning(sb, __func__, - "Block bitmap same as inode bitmap (%u)", - input->block_bitmap); - else if (inside(input->block_bitmap, input->inode_table, itend)) - ext3_warning(sb, __func__, - "Block bitmap (%u) in inode table (%u-"E3FSBLK")", - input->block_bitmap, input->inode_table, itend-1); - else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext3_warning(sb, __func__, - "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", - input->inode_bitmap, input->inode_table, itend-1); - else if (inside(input->block_bitmap, start, metaend)) - ext3_warning(sb, __func__, - "Block bitmap (%u) in GDT table" - " ("E3FSBLK"-"E3FSBLK")", - input->block_bitmap, start, metaend - 1); - else if (inside(input->inode_bitmap, start, metaend)) - ext3_warning(sb, __func__, - "Inode bitmap (%u) in GDT table" - " ("E3FSBLK"-"E3FSBLK")", - input->inode_bitmap, start, metaend - 1); - else if (inside(input->inode_table, start, metaend) || - inside(itend - 1, start, metaend)) - ext3_warning(sb, __func__, - "Inode table (%u-"E3FSBLK") overlaps" - "GDT table ("E3FSBLK"-"E3FSBLK")", - input->inode_table, itend - 1, start, metaend - 1); - else - err = 0; - brelse(bh); - - return err; -} - -static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, - ext3_fsblk_t blk) -{ - struct buffer_head *bh; - int err; - - bh = sb_getblk(sb, blk); - if (unlikely(!bh)) - return ERR_PTR(-ENOMEM); - if ((err = ext3_journal_get_write_access(handle, bh))) { - brelse(bh); - bh = ERR_PTR(err); - } else { - lock_buffer(bh); - memset(bh->b_data, 0, sb->s_blocksize); - set_buffer_uptodate(bh); - unlock_buffer(bh); - } - - return bh; -} - -/* - * To avoid calling the atomic setbit hundreds or thousands of times, we only - * need to use it within a single byte (to ensure we get endianness right). - * We can use memset for the rest of the bitmap as there are no other users. - */ -static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) -{ - int i; - - if (start_bit >= end_bit) - return; - - ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); - for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) - ext3_set_bit(i, bitmap); - if (i < end_bit) - memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); -} - -/* - * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA. - * If that fails, restart the transaction & regain write access for the - * buffer head which is used for block_bitmap modifications. - */ -static int extend_or_restart_transaction(handle_t *handle, int thresh, - struct buffer_head *bh) -{ - int err; - - if (handle->h_buffer_credits >= thresh) - return 0; - - err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA); - if (err < 0) - return err; - if (err) { - err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA); - if (err) - return err; - err = ext3_journal_get_write_access(handle, bh); - if (err) - return err; - } - - return 0; -} - -/* - * Set up the block and inode bitmaps, and the inode table for the new group. - * This doesn't need to be part of the main transaction, since we are only - * changing blocks outside the actual filesystem. We still do journaling to - * ensure the recovery is correct in case of a failure just after resize. - * If any part of this fails, we simply abort the resize. - */ -static int setup_new_group_blocks(struct super_block *sb, - struct ext3_new_group_data *input) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group); - int reserved_gdb = ext3_bg_has_super(sb, input->group) ? - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; - unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group); - struct buffer_head *bh; - handle_t *handle; - ext3_fsblk_t block; - ext3_grpblk_t bit; - int i; - int err = 0, err2; - - /* This transaction may be extended/restarted along the way */ - handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); - - if (IS_ERR(handle)) - return PTR_ERR(handle); - - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - err = -EBUSY; - goto exit_journal; - } - - if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - - if (ext3_bg_has_super(sb, input->group)) { - ext3_debug("mark backup superblock %#04lx (+0)\n", start); - ext3_set_bit(0, bh->b_data); - } - - /* Copy all of the GDT blocks into the backup in this group */ - for (i = 0, bit = 1, block = start + 1; - i < gdblocks; i++, block++, bit++) { - struct buffer_head *gdb; - - ext3_debug("update backup group %#04lx (+%d)\n", block, bit); - - err = extend_or_restart_transaction(handle, 1, bh); - if (err) - goto exit_bh; - - gdb = sb_getblk(sb, block); - if (unlikely(!gdb)) { - err = -ENOMEM; - goto exit_bh; - } - if ((err = ext3_journal_get_write_access(handle, gdb))) { - brelse(gdb); - goto exit_bh; - } - lock_buffer(gdb); - memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); - set_buffer_uptodate(gdb); - unlock_buffer(gdb); - err = ext3_journal_dirty_metadata(handle, gdb); - if (err) { - brelse(gdb); - goto exit_bh; - } - ext3_set_bit(bit, bh->b_data); - brelse(gdb); - } - - /* Zero out all of the reserved backup group descriptor table blocks */ - for (i = 0, bit = gdblocks + 1, block = start + bit; - i < reserved_gdb; i++, block++, bit++) { - struct buffer_head *gdb; - - ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit); - - err = extend_or_restart_transaction(handle, 1, bh); - if (err) - goto exit_bh; - - if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(gdb); - goto exit_bh; - } - err = ext3_journal_dirty_metadata(handle, gdb); - if (err) { - brelse(gdb); - goto exit_bh; - } - ext3_set_bit(bit, bh->b_data); - brelse(gdb); - } - ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, - input->block_bitmap - start); - ext3_set_bit(input->block_bitmap - start, bh->b_data); - ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, - input->inode_bitmap - start); - ext3_set_bit(input->inode_bitmap - start, bh->b_data); - - /* Zero out all of the inode table blocks */ - for (i = 0, block = input->inode_table, bit = block - start; - i < sbi->s_itb_per_group; i++, bit++, block++) { - struct buffer_head *it; - - ext3_debug("clear inode block %#04lx (+%d)\n", block, bit); - - err = extend_or_restart_transaction(handle, 1, bh); - if (err) - goto exit_bh; - - if (IS_ERR(it = bclean(handle, sb, block))) { - err = PTR_ERR(it); - goto exit_bh; - } - err = ext3_journal_dirty_metadata(handle, it); - if (err) { - brelse(it); - goto exit_bh; - } - brelse(it); - ext3_set_bit(bit, bh->b_data); - } - - err = extend_or_restart_transaction(handle, 2, bh); - if (err) - goto exit_bh; - - mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), - bh->b_data); - err = ext3_journal_dirty_metadata(handle, bh); - if (err) - goto exit_bh; - brelse(bh); - - /* Mark unused entries in inode bitmap used */ - ext3_debug("clear inode bitmap %#04x (+%ld)\n", - input->inode_bitmap, input->inode_bitmap - start); - if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - - mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), - bh->b_data); - err = ext3_journal_dirty_metadata(handle, bh); -exit_bh: - brelse(bh); - -exit_journal: - mutex_unlock(&sbi->s_resize_lock); - if ((err2 = ext3_journal_stop(handle)) && !err) - err = err2; - - return err; -} - -/* - * Iterate through the groups which hold BACKUP superblock/GDT copies in an - * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before - * calling this for the first time. In a sparse filesystem it will be the - * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... - * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... - */ -static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, - unsigned *five, unsigned *seven) -{ - unsigned *min = three; - int mult = 3; - unsigned ret; - - if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ret = *min; - *min += 1; - return ret; - } - - if (*five < *min) { - min = five; - mult = 5; - } - if (*seven < *min) { - min = seven; - mult = 7; - } - - ret = *min; - *min *= mult; - - return ret; -} - -/* - * Check that all of the backup GDT blocks are held in the primary GDT block. - * It is assumed that they are stored in group order. Returns the number of - * groups in current filesystem that have BACKUPS, or -ve error code. - */ -static int verify_reserved_gdb(struct super_block *sb, - struct buffer_head *primary) -{ - const ext3_fsblk_t blk = primary->b_blocknr; - const unsigned long end = EXT3_SB(sb)->s_groups_count; - unsigned three = 1; - unsigned five = 5; - unsigned seven = 7; - unsigned grp; - __le32 *p = (__le32 *)primary->b_data; - int gdbackups = 0; - - while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { - if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ - ext3_warning(sb, __func__, - "reserved GDT "E3FSBLK - " missing grp %d ("E3FSBLK")", - blk, grp, - grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); - return -EINVAL; - } - if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb)) - return -EFBIG; - } - - return gdbackups; -} - -/* - * Called when we need to bring a reserved group descriptor table block into - * use from the resize inode. The primary copy of the new GDT block currently - * is an indirect block (under the double indirect block in the resize inode). - * The new backup GDT blocks will be stored as leaf blocks in this indirect - * block, in group order. Even though we know all the block numbers we need, - * we check to ensure that the resize inode has actually reserved these blocks. - * - * Don't need to update the block bitmaps because the blocks are still in use. - * - * We get all of the error cases out of the way, so that we are sure to not - * fail once we start modifying the data on disk, because JBD has no rollback. - */ -static int add_new_gdb(handle_t *handle, struct inode *inode, - struct ext3_new_group_data *input, - struct buffer_head **primary) -{ - struct super_block *sb = inode->i_sb; - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); - ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; - struct buffer_head **o_group_desc, **n_group_desc; - struct buffer_head *dind; - int gdbackups; - struct ext3_iloc iloc; - __le32 *data; - int err; - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG - "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n", - gdb_num); - - /* - * If we are not using the primary superblock/GDT copy don't resize, - * because the user tools have no way of handling this. Probably a - * bad time to do it anyways. - */ - if (EXT3_SB(sb)->s_sbh->b_blocknr != - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { - ext3_warning(sb, __func__, - "won't resize using backup superblock at %llu", - (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); - return -EPERM; - } - - *primary = sb_bread(sb, gdblock); - if (!*primary) - return -EIO; - - if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { - err = gdbackups; - goto exit_bh; - } - - data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_bh; - } - - data = (__le32 *)dind->b_data; - if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext3_warning(sb, __func__, - "new group %u GDT block "E3FSBLK" not reserved", - input->group, gdblock); - err = -EINVAL; - goto exit_dind; - } - - if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) - goto exit_dind; - - if ((err = ext3_journal_get_write_access(handle, *primary))) - goto exit_sbh; - - if ((err = ext3_journal_get_write_access(handle, dind))) - goto exit_primary; - - /* ext3_reserve_inode_write() gets a reference on the iloc */ - if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) - goto exit_dindj; - - n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_NOFS); - if (!n_group_desc) { - err = -ENOMEM; - ext3_warning (sb, __func__, - "not enough memory for %lu groups", gdb_num + 1); - goto exit_inode; - } - - /* - * Finally, we have all of the possible failures behind us... - * - * Remove new GDT block from inode double-indirect block and clear out - * the new GDT block for use (which also "frees" the backup GDT blocks - * from the reserved inode). We don't need to change the bitmaps for - * these blocks, because they are marked as in-use from being in the - * reserved inode, and will become GDT blocks (primary and backup). - */ - data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; - err = ext3_journal_dirty_metadata(handle, dind); - if (err) - goto exit_group_desc; - brelse(dind); - dind = NULL; - inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; - err = ext3_mark_iloc_dirty(handle, inode, &iloc); - if (err) - goto exit_group_desc; - memset((*primary)->b_data, 0, sb->s_blocksize); - err = ext3_journal_dirty_metadata(handle, *primary); - if (err) - goto exit_group_desc; - - o_group_desc = EXT3_SB(sb)->s_group_desc; - memcpy(n_group_desc, o_group_desc, - EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); - n_group_desc[gdb_num] = *primary; - EXT3_SB(sb)->s_group_desc = n_group_desc; - EXT3_SB(sb)->s_gdb_count++; - kfree(o_group_desc); - - le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - if (err) - goto exit_inode; - - return 0; - -exit_group_desc: - kfree(n_group_desc); -exit_inode: - //ext3_journal_release_buffer(handle, iloc.bh); - brelse(iloc.bh); -exit_dindj: - //ext3_journal_release_buffer(handle, dind); -exit_primary: - //ext3_journal_release_buffer(handle, *primary); -exit_sbh: - //ext3_journal_release_buffer(handle, *primary); -exit_dind: - brelse(dind); -exit_bh: - brelse(*primary); - - ext3_debug("leaving with error %d\n", err); - return err; -} - -/* - * Called when we are adding a new group which has a backup copy of each of - * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. - * We need to add these reserved backup GDT blocks to the resize inode, so - * that they are kept for future resizing and not allocated to files. - * - * Each reserved backup GDT block will go into a different indirect block. - * The indirect blocks are actually the primary reserved GDT blocks, - * so we know in advance what their block numbers are. We only get the - * double-indirect block to verify it is pointing to the primary reserved - * GDT blocks so we don't overwrite a data block by accident. The reserved - * backup GDT blocks are stored in their reserved primary GDT block. - */ -static int reserve_backup_gdb(handle_t *handle, struct inode *inode, - struct ext3_new_group_data *input) -{ - struct super_block *sb = inode->i_sb; - int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks); - struct buffer_head **primary; - struct buffer_head *dind; - struct ext3_iloc iloc; - ext3_fsblk_t blk; - __le32 *data, *end; - int gdbackups = 0; - int res, i; - int err; - - primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); - if (!primary) - return -ENOMEM; - - data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_free; - } - - blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count % - EXT3_ADDR_PER_BLOCK(sb)); - end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); - - /* Get each reserved primary GDT block and verify it holds backups */ - for (res = 0; res < reserved_gdb; res++, blk++) { - if (le32_to_cpu(*data) != blk) { - ext3_warning(sb, __func__, - "reserved block "E3FSBLK - " not at offset %ld", - blk, - (long)(data - (__le32 *)dind->b_data)); - err = -EINVAL; - goto exit_bh; - } - primary[res] = sb_bread(sb, blk); - if (!primary[res]) { - err = -EIO; - goto exit_bh; - } - if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { - brelse(primary[res]); - err = gdbackups; - goto exit_bh; - } - if (++data >= end) - data = (__le32 *)dind->b_data; - } - - for (i = 0; i < reserved_gdb; i++) { - if ((err = ext3_journal_get_write_access(handle, primary[i]))) { - /* - int j; - for (j = 0; j < i; j++) - ext3_journal_release_buffer(handle, primary[j]); - */ - goto exit_bh; - } - } - - if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) - goto exit_bh; - - /* - * Finally we can add each of the reserved backup GDT blocks from - * the new group to its reserved primary GDT block. - */ - blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); - for (i = 0; i < reserved_gdb; i++) { - int err2; - data = (__le32 *)primary[i]->b_data; - /* printk("reserving backup %lu[%u] = %lu\n", - primary[i]->b_blocknr, gdbackups, - blk + primary[i]->b_blocknr); */ - data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); - err2 = ext3_journal_dirty_metadata(handle, primary[i]); - if (!err) - err = err2; - } - inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; - ext3_mark_iloc_dirty(handle, inode, &iloc); - -exit_bh: - while (--res >= 0) - brelse(primary[res]); - brelse(dind); - -exit_free: - kfree(primary); - - return err; -} - -/* - * Update the backup copies of the ext3 metadata. These don't need to be part - * of the main resize transaction, because e2fsck will re-write them if there - * is a problem (basically only OOM will cause a problem). However, we - * _should_ update the backups if possible, in case the primary gets trashed - * for some reason and we need to run e2fsck from a backup superblock. The - * important part is that the new block and inode counts are in the backup - * superblocks, and the location of the new group metadata in the GDT backups. - * - * We do not need take the s_resize_lock for this, because these - * blocks are not otherwise touched by the filesystem code when it is - * mounted. We don't need to worry about last changing from - * sbi->s_groups_count, because the worst that can happen is that we - * do not copy the full number of backups at this time. The resize - * which changed s_groups_count will backup again. - */ -static void update_backups(struct super_block *sb, - int blk_off, char *data, int size) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - const unsigned long last = sbi->s_groups_count; - const int bpg = EXT3_BLOCKS_PER_GROUP(sb); - unsigned three = 1; - unsigned five = 5; - unsigned seven = 7; - unsigned group; - int rest = sb->s_blocksize - size; - handle_t *handle; - int err = 0, err2; - - handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); - if (IS_ERR(handle)) { - group = 1; - err = PTR_ERR(handle); - goto exit_err; - } - - while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) { - struct buffer_head *bh; - - /* Out of journal space, and can't get more - abort - so sad */ - if (handle->h_buffer_credits == 0 && - ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) && - (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA))) - break; - - bh = sb_getblk(sb, group * bpg + blk_off); - if (unlikely(!bh)) { - err = -ENOMEM; - break; - } - ext3_debug("update metadata backup %#04lx\n", - (unsigned long)bh->b_blocknr); - if ((err = ext3_journal_get_write_access(handle, bh))) { - brelse(bh); - break; - } - lock_buffer(bh); - memcpy(bh->b_data, data, size); - if (rest) - memset(bh->b_data + size, 0, rest); - set_buffer_uptodate(bh); - unlock_buffer(bh); - err = ext3_journal_dirty_metadata(handle, bh); - brelse(bh); - if (err) - break; - } - if ((err2 = ext3_journal_stop(handle)) && !err) - err = err2; - - /* - * Ugh! Need to have e2fsck write the backup copies. It is too - * late to revert the resize, we shouldn't fail just because of - * the backup copies (they are only needed in case of corruption). - * - * However, if we got here we have a journal problem too, so we - * can't really start a transaction to mark the superblock. - * Chicken out and just set the flag on the hope it will be written - * to disk, and if not - we will simply wait until next fsck. - */ -exit_err: - if (err) { - ext3_warning(sb, __func__, - "can't update backup for group %d (err %d), " - "forcing fsck on next reboot", group, err); - sbi->s_mount_state &= ~EXT3_VALID_FS; - sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS); - mark_buffer_dirty(sbi->s_sbh); - } -} - -/* Add group descriptor data to an existing or new group descriptor block. - * Ensure we handle all possible error conditions _before_ we start modifying - * the filesystem, because we cannot abort the transaction and not have it - * write the data to disk. - * - * If we are on a GDT block boundary, we need to get the reserved GDT block. - * Otherwise, we may need to add backup GDT blocks for a sparse group. - * - * We only need to hold the superblock lock while we are actually adding - * in the new group's counts to the superblock. Prior to that we have - * not really "added" the group at all. We re-check that we are still - * adding in the last group in case things have changed since verifying. - */ -int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - int reserved_gdb = ext3_bg_has_super(sb, input->group) ? - le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct buffer_head *primary = NULL; - struct ext3_group_desc *gdp; - struct inode *inode = NULL; - handle_t *handle; - int gdb_off, gdb_num; - int err, err2; - - gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); - gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb); - - if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, - EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext3_warning(sb, __func__, - "Can't resize non-sparse filesystem further"); - return -EPERM; - } - - if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < - le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __func__, "blocks_count overflow\n"); - return -EINVAL; - } - - if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) < - le32_to_cpu(es->s_inodes_count)) { - ext3_warning(sb, __func__, "inodes_count overflow\n"); - return -EINVAL; - } - - if (reserved_gdb || gdb_off == 0) { - if (!EXT3_HAS_COMPAT_FEATURE(sb, - EXT3_FEATURE_COMPAT_RESIZE_INODE) - || !le16_to_cpu(es->s_reserved_gdt_blocks)) { - ext3_warning(sb, __func__, - "No reserved GDT blocks, can't resize"); - return -EPERM; - } - inode = ext3_iget(sb, EXT3_RESIZE_INO); - if (IS_ERR(inode)) { - ext3_warning(sb, __func__, - "Error opening resize inode"); - return PTR_ERR(inode); - } - } - - if ((err = verify_group_input(sb, input))) - goto exit_put; - - if ((err = setup_new_group_blocks(sb, input))) - goto exit_put; - - /* - * We will always be modifying at least the superblock and a GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. If we - * are adding a group with superblock/GDT backups we will also - * modify each of the reserved GDT dindirect blocks. - */ - handle = ext3_journal_start_sb(sb, - ext3_bg_has_super(sb, input->group) ? - 3 + reserved_gdb : 4); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto exit_put; - } - - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - ext3_warning(sb, __func__, - "multiple resizers run on filesystem!"); - err = -EBUSY; - goto exit_journal; - } - - if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh))) - goto exit_journal; - - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ - if (gdb_off) { - primary = sbi->s_group_desc[gdb_num]; - if ((err = ext3_journal_get_write_access(handle, primary))) - goto exit_journal; - - if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) && - (err = reserve_backup_gdb(handle, inode, input))) - goto exit_journal; - } else if ((err = add_new_gdb(handle, inode, input, &primary))) - goto exit_journal; - - /* - * OK, now we've set up the new group. Time to make it active. - * - * We do not lock all allocations via s_resize_lock - * so we have to be safe wrt. concurrent accesses the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ - - /* Update group descriptor block for new group */ - gdp = (struct ext3_group_desc *)primary->b_data + gdb_off; - - gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); - gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); - gdp->bg_inode_table = cpu_to_le32(input->inode_table); - gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); - gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); - - /* - * Make the new blocks and inodes valid next. We do this before - * increasing the group count so that once the group is enabled, - * all of its blocks and inodes are already valid. - * - * We always allocate group-by-group, then block-by-block or - * inode-by-inode within a group, so enabling these - * blocks/inodes before the group is live won't actually let us - * allocate the new space yet. - */ - le32_add_cpu(&es->s_blocks_count, input->blocks_count); - le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb)); - - /* - * We need to protect s_groups_count against other CPUs seeing - * inconsistent state in the superblock. - * - * The precise rules we use are: - * - * * Writers of s_groups_count *must* hold s_resize_lock - * AND - * * Writers must perform a smp_wmb() after updating all dependent - * data and before modifying the groups count - * - * * Readers must hold s_resize_lock over the access - * OR - * * Readers must perform an smp_rmb() after reading the groups count - * and before reading any dependent data. - * - * NB. These rules can be relaxed when checking the group count - * while freeing data, as we can only allocate from a block - * group after serialising against the group count, and we can - * only then free after serialising in turn against that - * allocation. - */ - smp_wmb(); - - /* Update the global fs size fields */ - sbi->s_groups_count++; - - err = ext3_journal_dirty_metadata(handle, primary); - if (err) - goto exit_journal; - - /* Update the reserved block counts only once the new group is - * active. */ - le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeblocks_counter, - input->free_blocks_count); - percpu_counter_add(&sbi->s_freeinodes_counter, - EXT3_INODES_PER_GROUP(sb)); - - err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); - -exit_journal: - mutex_unlock(&sbi->s_resize_lock); - if ((err2 = ext3_journal_stop(handle)) && !err) - err = err2; - if (!err) { - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext3_super_block)); - update_backups(sb, primary->b_blocknr, primary->b_data, - primary->b_size); - } -exit_put: - iput(inode); - return err; -} /* ext3_group_add */ - -/* Extend the filesystem to the new number of blocks specified. This entry - * point is only used to extend the current filesystem to the end of the last - * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" - * for emergencies (because it has no dependencies on reserved blocks). - * - * If we _really_ wanted, we could use default values to call ext3_group_add() - * allow the "remount" trick to work for arbitrary resizing, assuming enough - * GDT blocks are reserved to grow to the desired size. - */ -int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, - ext3_fsblk_t n_blocks_count) -{ - ext3_fsblk_t o_blocks_count; - ext3_grpblk_t last; - ext3_grpblk_t add; - struct buffer_head * bh; - handle_t *handle; - int err; - unsigned long freed_blocks; - - /* We don't need to worry about locking wrt other resizers just - * yet: we're going to revalidate es->s_blocks_count after - * taking the s_resize_lock below. */ - o_blocks_count = le32_to_cpu(es->s_blocks_count); - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK - " up to "E3FSBLK" blocks\n", - o_blocks_count, n_blocks_count); - - if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) - return 0; - - if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT3-fs: filesystem on %s:" - " too large to resize to "E3FSBLK" blocks safely\n", - sb->s_id, n_blocks_count); - if (sizeof(sector_t) < 8) - ext3_warning(sb, __func__, - "CONFIG_LBDAF not enabled\n"); - return -EINVAL; - } - - if (n_blocks_count < o_blocks_count) { - ext3_warning(sb, __func__, - "can't shrink FS - resize aborted"); - return -EBUSY; - } - - /* Handle the remaining blocks in the last group only. */ - last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % - EXT3_BLOCKS_PER_GROUP(sb); - - if (last == 0) { - ext3_warning(sb, __func__, - "need to use ext2online to resize further"); - return -EPERM; - } - - add = EXT3_BLOCKS_PER_GROUP(sb) - last; - - if (o_blocks_count + add < o_blocks_count) { - ext3_warning(sb, __func__, "blocks_count overflow"); - return -EINVAL; - } - - if (o_blocks_count + add > n_blocks_count) - add = n_blocks_count - o_blocks_count; - - if (o_blocks_count + add < n_blocks_count) - ext3_warning(sb, __func__, - "will only finish group ("E3FSBLK - " blocks, %u new)", - o_blocks_count + add, add); - - /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, o_blocks_count + add -1); - if (!bh) { - ext3_warning(sb, __func__, - "can't read last block, resize aborted"); - return -ENOSPC; - } - brelse(bh); - - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext3_free_blocks(). - */ - handle = ext3_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext3_warning(sb, __func__, "error %d on journal start",err); - goto exit_put; - } - - mutex_lock(&EXT3_SB(sb)->s_resize_lock); - if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __func__, - "multiple resizers run on filesystem!"); - mutex_unlock(&EXT3_SB(sb)->s_resize_lock); - ext3_journal_stop(handle); - err = -EBUSY; - goto exit_put; - } - - if ((err = ext3_journal_get_write_access(handle, - EXT3_SB(sb)->s_sbh))) { - ext3_warning(sb, __func__, - "error %d on journal write access", err); - mutex_unlock(&EXT3_SB(sb)->s_resize_lock); - ext3_journal_stop(handle); - goto exit_put; - } - es->s_blocks_count = cpu_to_le32(o_blocks_count + add); - err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - mutex_unlock(&EXT3_SB(sb)->s_resize_lock); - if (err) { - ext3_warning(sb, __func__, - "error %d on journal dirty metadata", err); - ext3_journal_stop(handle); - goto exit_put; - } - ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n", - o_blocks_count, o_blocks_count + add); - ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); - ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", - o_blocks_count, o_blocks_count + add); - if ((err = ext3_journal_stop(handle))) - goto exit_put; - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n", - le32_to_cpu(es->s_blocks_count)); - update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext3_super_block)); -exit_put: - return err; -} /* ext3_group_extend */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c deleted file mode 100644 index 5ed0044fbb37..000000000000 --- a/fs/ext3/super.c +++ /dev/null @@ -1,3165 +0,0 @@ -/* - * linux/fs/ext3/super.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/inode.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/module.h> -#include <linux/blkdev.h> -#include <linux/parser.h> -#include <linux/exportfs.h> -#include <linux/statfs.h> -#include <linux/random.h> -#include <linux/mount.h> -#include <linux/quotaops.h> -#include <linux/seq_file.h> -#include <linux/log2.h> -#include <linux/cleancache.h> -#include <linux/namei.h> - -#include <asm/uaccess.h> - -#define CREATE_TRACE_POINTS - -#include "ext3.h" -#include "xattr.h" -#include "acl.h" -#include "namei.h" - -#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED - #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA -#else - #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA -#endif - -static int ext3_load_journal(struct super_block *, struct ext3_super_block *, - unsigned long journal_devnum); -static int ext3_create_journal(struct super_block *, struct ext3_super_block *, - unsigned int); -static int ext3_commit_super(struct super_block *sb, - struct ext3_super_block *es, - int sync); -static void ext3_mark_recovery_complete(struct super_block * sb, - struct ext3_super_block * es); -static void ext3_clear_journal_err(struct super_block * sb, - struct ext3_super_block * es); -static int ext3_sync_fs(struct super_block *sb, int wait); -static const char *ext3_decode_error(struct super_block * sb, int errno, - char nbuf[16]); -static int ext3_remount (struct super_block * sb, int * flags, char * data); -static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); -static int ext3_unfreeze(struct super_block *sb); -static int ext3_freeze(struct super_block *sb); - -/* - * Wrappers for journal_start/end. - */ -handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) -{ - journal_t *journal; - - if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - - /* Special case here: if the journal has aborted behind our - * backs (eg. EIO in the commit thread), then we still need to - * take the FS itself readonly cleanly. */ - journal = EXT3_SB(sb)->s_journal; - if (is_journal_aborted(journal)) { - ext3_abort(sb, __func__, - "Detected aborted journal"); - return ERR_PTR(-EROFS); - } - - return journal_start(journal, nblocks); -} - -int __ext3_journal_stop(const char *where, handle_t *handle) -{ - struct super_block *sb; - int err; - int rc; - - sb = handle->h_transaction->t_journal->j_private; - err = handle->h_err; - rc = journal_stop(handle); - - if (!err) - err = rc; - if (err) - __ext3_std_error(sb, where, err); - return err; -} - -void ext3_journal_abort_handle(const char *caller, const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err) -{ - char nbuf[16]; - const char *errstr = ext3_decode_error(NULL, err, nbuf); - - if (bh) - BUFFER_TRACE(bh, "abort"); - - if (!handle->h_err) - handle->h_err = err; - - if (is_handle_aborted(handle)) - return; - - printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); - - journal_abort_handle(handle); -} - -void ext3_msg(struct super_block *sb, const char *prefix, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf); - - va_end(args); -} - -/* Deal with the reporting of failure conditions on a filesystem such as - * inconsistencies detected or read IO failures. - * - * On ext2, we can store the error state of the filesystem in the - * superblock. That is not possible on ext3, because we may have other - * write ordering constraints on the superblock which prevent us from - * writing it out straight away; and given that the journal is about to - * be aborted, we can't rely on the current, or future, transactions to - * write out the superblock safely. - * - * We'll just use the journal_abort() error code to record an error in - * the journal instead. On recovery, the journal will complain about - * that error until we've noted it down and cleared it. - */ - -static void ext3_handle_error(struct super_block *sb) -{ - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - - EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - es->s_state |= cpu_to_le16(EXT3_ERROR_FS); - - if (sb->s_flags & MS_RDONLY) - return; - - if (!test_opt (sb, ERRORS_CONT)) { - journal_t *journal = EXT3_SB(sb)->s_journal; - - set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); - if (journal) - journal_abort(journal, -EIO); - } - if (test_opt (sb, ERRORS_RO)) { - ext3_msg(sb, KERN_CRIT, - "error: remounting filesystem read-only"); - /* - * Make sure updated value of ->s_mount_state will be visible - * before ->s_flags update. - */ - smp_wmb(); - sb->s_flags |= MS_RDONLY; - } - ext3_commit_super(sb, es, 1); - if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs (%s): panic forced after error\n", - sb->s_id); -} - -void ext3_error(struct super_block *sb, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n", - sb->s_id, function, &vaf); - - va_end(args); - - ext3_handle_error(sb); -} - -static const char *ext3_decode_error(struct super_block * sb, int errno, - char nbuf[16]) -{ - char *errstr = NULL; - - switch (errno) { - case -EIO: - errstr = "IO failure"; - break; - case -ENOMEM: - errstr = "Out of memory"; - break; - case -EROFS: - if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) - errstr = "Journal has aborted"; - else - errstr = "Readonly filesystem"; - break; - default: - /* If the caller passed in an extra buffer for unknown - * errors, textualise them now. Else we just return - * NULL. */ - if (nbuf) { - /* Check for truncated error codes... */ - if (snprintf(nbuf, 16, "error %d", -errno) >= 0) - errstr = nbuf; - } - break; - } - - return errstr; -} - -/* __ext3_std_error decodes expected errors from journaling functions - * automatically and invokes the appropriate error response. */ - -void __ext3_std_error (struct super_block * sb, const char * function, - int errno) -{ - char nbuf[16]; - const char *errstr; - - /* Special case: if the error is EROFS, and we're not already - * inside a transaction, then there's really no point in logging - * an error. */ - if (errno == -EROFS && journal_current_handle() == NULL && - (sb->s_flags & MS_RDONLY)) - return; - - errstr = ext3_decode_error(sb, errno, nbuf); - ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr); - - ext3_handle_error(sb); -} - -/* - * ext3_abort is a much stronger failure handler than ext3_error. The - * abort function may be used to deal with unrecoverable failures such - * as journal IO errors or ENOMEM at a critical moment in log management. - * - * We unconditionally force the filesystem into an ABORT|READONLY state, - * unless the error response on the fs has been set to panic in which - * case we take the easy way out and panic immediately. - */ - -void ext3_abort(struct super_block *sb, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n", - sb->s_id, function, &vaf); - - va_end(args); - - if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs: panic from previous error\n"); - - if (sb->s_flags & MS_RDONLY) - return; - - ext3_msg(sb, KERN_CRIT, - "error: remounting filesystem read-only"); - EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); - /* - * Make sure updated value of ->s_mount_state will be visible - * before ->s_flags update. - */ - smp_wmb(); - sb->s_flags |= MS_RDONLY; - - if (EXT3_SB(sb)->s_journal) - journal_abort(EXT3_SB(sb)->s_journal, -EIO); -} - -void ext3_warning(struct super_block *sb, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n", - sb->s_id, function, &vaf); - - va_end(args); -} - -void ext3_update_dynamic_rev(struct super_block *sb) -{ - struct ext3_super_block *es = EXT3_SB(sb)->s_es; - - if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) - return; - - ext3_msg(sb, KERN_WARNING, - "warning: updating to rev %d because of " - "new feature flag, running e2fsck is recommended", - EXT3_DYNAMIC_REV); - - es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); - es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); - es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); - /* leave es->s_feature_*compat flags alone */ - /* es->s_uuid will be set by e2fsck if empty */ - - /* - * The rest of the superblock fields should be zero, and if not it - * means they are likely already in use, so leave them alone. We - * can leave it up to e2fsck to clean up any inconsistencies there. - */ -} - -/* - * Open the external journal device - */ -static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - char b[BDEVNAME_SIZE]; - - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld", - __bdevname(dev, b), PTR_ERR(bdev)); - - return NULL; -} - -/* - * Release the journal device - */ -static void ext3_blkdev_put(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - -static void ext3_blkdev_remove(struct ext3_sb_info *sbi) -{ - struct block_device *bdev; - bdev = sbi->journal_bdev; - if (bdev) { - ext3_blkdev_put(bdev); - sbi->journal_bdev = NULL; - } -} - -static inline struct inode *orphan_list_entry(struct list_head *l) -{ - return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode; -} - -static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) -{ - struct list_head *l; - - ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d", - le32_to_cpu(sbi->s_es->s_last_orphan)); - - ext3_msg(sb, KERN_ERR, "sb_info orphan list:"); - list_for_each(l, &sbi->s_orphan) { - struct inode *inode = orphan_list_entry(l); - ext3_msg(sb, KERN_ERR, " " - "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", - inode->i_sb->s_id, inode->i_ino, inode, - inode->i_mode, inode->i_nlink, - NEXT_ORPHAN(inode)); - } -} - -static void ext3_put_super (struct super_block * sb) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - int i, err; - - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - ext3_xattr_put_super(sb); - err = journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; - if (err < 0) - ext3_abort(sb, __func__, "Couldn't clean up the journal"); - - if (!(sb->s_flags & MS_RDONLY)) { - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - es->s_state = cpu_to_le16(sbi->s_mount_state); - BUFFER_TRACE(sbi->s_sbh, "marking dirty"); - mark_buffer_dirty(sbi->s_sbh); - ext3_commit_super(sb, es, 1); - } - - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - brelse(sbi->s_sbh); -#ifdef CONFIG_QUOTA - for (i = 0; i < EXT3_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); -#endif - - /* Debugging code just in case the in-memory inode orphan list - * isn't empty. The on-disk one can be non-empty if we've - * detected an error and taken the fs readonly, but the - * in-memory list had better be clean by this point. */ - if (!list_empty(&sbi->s_orphan)) - dump_orphan_list(sb, sbi); - J_ASSERT(list_empty(&sbi->s_orphan)); - - invalidate_bdev(sb->s_bdev); - if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ - sync_blockdev(sbi->journal_bdev); - invalidate_bdev(sbi->journal_bdev); - ext3_blkdev_remove(sbi); - } - sb->s_fs_info = NULL; - kfree(sbi->s_blockgroup_lock); - mutex_destroy(&sbi->s_orphan_lock); - mutex_destroy(&sbi->s_resize_lock); - kfree(sbi); -} - -static struct kmem_cache *ext3_inode_cachep; - -/* - * Called inside transaction, so use GFP_NOFS - */ -static struct inode *ext3_alloc_inode(struct super_block *sb) -{ - struct ext3_inode_info *ei; - - ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS); - if (!ei) - return NULL; - ei->i_block_alloc_info = NULL; - ei->vfs_inode.i_version = 1; - atomic_set(&ei->i_datasync_tid, 0); - atomic_set(&ei->i_sync_tid, 0); -#ifdef CONFIG_QUOTA - memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); -#endif - - return &ei->vfs_inode; -} - -static int ext3_drop_inode(struct inode *inode) -{ - int drop = generic_drop_inode(inode); - - trace_ext3_drop_inode(inode, drop); - return drop; -} - -static void ext3_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); -} - -static void ext3_destroy_inode(struct inode *inode) -{ - if (!list_empty(&(EXT3_I(inode)->i_orphan))) { - printk("EXT3 Inode %p: orphan list check failed!\n", - EXT3_I(inode)); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, - EXT3_I(inode), sizeof(struct ext3_inode_info), - false); - dump_stack(); - } - call_rcu(&inode->i_rcu, ext3_i_callback); -} - -static void init_once(void *foo) -{ - struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; - - INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT3_FS_XATTR - init_rwsem(&ei->xattr_sem); -#endif - mutex_init(&ei->truncate_mutex); - inode_init_once(&ei->vfs_inode); -} - -static int __init init_inodecache(void) -{ - ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", - sizeof(struct ext3_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (ext3_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(ext3_inode_cachep); -} - -static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) - struct ext3_sb_info *sbi = EXT3_SB(sb); - - if (sbi->s_jquota_fmt) { - char *fmtname = ""; - - switch (sbi->s_jquota_fmt) { - case QFMT_VFS_OLD: - fmtname = "vfsold"; - break; - case QFMT_VFS_V0: - fmtname = "vfsv0"; - break; - case QFMT_VFS_V1: - fmtname = "vfsv1"; - break; - } - seq_printf(seq, ",jqfmt=%s", fmtname); - } - - if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (test_opt(sb, USRQUOTA)) - seq_puts(seq, ",usrquota"); - - if (test_opt(sb, GRPQUOTA)) - seq_puts(seq, ",grpquota"); -#endif -} - -static char *data_mode_string(unsigned long mode) -{ - switch (mode) { - case EXT3_MOUNT_JOURNAL_DATA: - return "journal"; - case EXT3_MOUNT_ORDERED_DATA: - return "ordered"; - case EXT3_MOUNT_WRITEBACK_DATA: - return "writeback"; - } - return "unknown"; -} - -/* - * Show an option if - * - it's set to a non-default value OR - * - if the per-sb default is different from the global default - */ -static int ext3_show_options(struct seq_file *seq, struct dentry *root) -{ - struct super_block *sb = root->d_sb; - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - unsigned long def_mount_opts; - - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - - if (sbi->s_sb_block != 1) - seq_printf(seq, ",sb=%lu", sbi->s_sb_block); - if (test_opt(sb, MINIX_DF)) - seq_puts(seq, ",minixdf"); - if (test_opt(sb, GRPID)) - seq_puts(seq, ",grpid"); - if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS)) - seq_puts(seq, ",nogrpid"); - if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) || - le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", - from_kuid_munged(&init_user_ns, sbi->s_resuid)); - } - if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) || - le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", - from_kgid_munged(&init_user_ns, sbi->s_resgid)); - } - if (test_opt(sb, ERRORS_RO)) { - int def_errors = le16_to_cpu(es->s_errors); - - if (def_errors == EXT3_ERRORS_PANIC || - def_errors == EXT3_ERRORS_CONTINUE) { - seq_puts(seq, ",errors=remount-ro"); - } - } - if (test_opt(sb, ERRORS_CONT)) - seq_puts(seq, ",errors=continue"); - if (test_opt(sb, ERRORS_PANIC)) - seq_puts(seq, ",errors=panic"); - if (test_opt(sb, NO_UID32)) - seq_puts(seq, ",nouid32"); - if (test_opt(sb, DEBUG)) - seq_puts(seq, ",debug"); -#ifdef CONFIG_EXT3_FS_XATTR - if (test_opt(sb, XATTR_USER)) - seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER) && - (def_mount_opts & EXT3_DEFM_XATTR_USER)) { - seq_puts(seq, ",nouser_xattr"); - } -#endif -#ifdef CONFIG_EXT3_FS_POSIX_ACL - if (test_opt(sb, POSIX_ACL)) - seq_puts(seq, ",acl"); - if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL)) - seq_puts(seq, ",noacl"); -#endif - if (!test_opt(sb, RESERVATION)) - seq_puts(seq, ",noreservation"); - if (sbi->s_commit_interval) { - seq_printf(seq, ",commit=%u", - (unsigned) (sbi->s_commit_interval / HZ)); - } - - /* - * Always display barrier state so it's clear what the status is. - */ - seq_puts(seq, ",barrier="); - seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); - seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); - if (test_opt(sb, DATA_ERR_ABORT)) - seq_puts(seq, ",data_err=abort"); - - if (test_opt(sb, NOLOAD)) - seq_puts(seq, ",norecovery"); - - ext3_show_quota_options(seq, sb); - - return 0; -} - - -static struct inode *ext3_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - struct inode *inode; - - if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO) - return ERR_PTR(-ESTALE); - if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) - return ERR_PTR(-ESTALE); - - /* iget isn't really right if the inode is currently unallocated!! - * - * ext3_read_inode will return a bad_inode if the inode had been - * deleted, so we should be safe. - * - * Currently we don't know the generation for parent directory, so - * a generation of 0 means "accept any" - */ - inode = ext3_iget(sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { - iput(inode); - return ERR_PTR(-ESTALE); - } - - return inode; -} - -static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - ext3_nfs_get_inode); -} - -static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - ext3_nfs_get_inode); -} - -/* - * Try to release metadata pages (indirect blocks, directories) which are - * mapped via the block device. Since these pages could have journal heads - * which would prevent try_to_free_buffers() from freeing them, we must use - * jbd layer's try_to_free_buffers() function to release them. - */ -static int bdev_try_to_free_page(struct super_block *sb, struct page *page, - gfp_t wait) -{ - journal_t *journal = EXT3_SB(sb)->s_journal; - - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - if (journal) - return journal_try_to_free_buffers(journal, page, - wait & ~__GFP_WAIT); - return try_to_free_buffers(page); -} - -#ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") -#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) - -static int ext3_write_dquot(struct dquot *dquot); -static int ext3_acquire_dquot(struct dquot *dquot); -static int ext3_release_dquot(struct dquot *dquot); -static int ext3_mark_dquot_dirty(struct dquot *dquot); -static int ext3_write_info(struct super_block *sb, int type); -static int ext3_quota_on(struct super_block *sb, int type, int format_id, - struct path *path); -static int ext3_quota_on_mount(struct super_block *sb, int type); -static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off); -static ssize_t ext3_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off); -static struct dquot **ext3_get_dquots(struct inode *inode) -{ - return EXT3_I(inode)->i_dquot; -} - -static const struct dquot_operations ext3_quota_operations = { - .write_dquot = ext3_write_dquot, - .acquire_dquot = ext3_acquire_dquot, - .release_dquot = ext3_release_dquot, - .mark_dirty = ext3_mark_dquot_dirty, - .write_info = ext3_write_info, - .alloc_dquot = dquot_alloc, - .destroy_dquot = dquot_destroy, -}; - -static const struct quotactl_ops ext3_qctl_operations = { - .quota_on = ext3_quota_on, - .quota_off = dquot_quota_off, - .quota_sync = dquot_quota_sync, - .get_state = dquot_get_state, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .set_dqblk = dquot_set_dqblk -}; -#endif - -static const struct super_operations ext3_sops = { - .alloc_inode = ext3_alloc_inode, - .destroy_inode = ext3_destroy_inode, - .write_inode = ext3_write_inode, - .dirty_inode = ext3_dirty_inode, - .drop_inode = ext3_drop_inode, - .evict_inode = ext3_evict_inode, - .put_super = ext3_put_super, - .sync_fs = ext3_sync_fs, - .freeze_fs = ext3_freeze, - .unfreeze_fs = ext3_unfreeze, - .statfs = ext3_statfs, - .remount_fs = ext3_remount, - .show_options = ext3_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ext3_quota_read, - .quota_write = ext3_quota_write, - .get_dquots = ext3_get_dquots, -#endif - .bdev_try_to_free_page = bdev_try_to_free_page, -}; - -static const struct export_operations ext3_export_ops = { - .fh_to_dentry = ext3_fh_to_dentry, - .fh_to_parent = ext3_fh_to_parent, - .get_parent = ext3_get_parent, -}; - -enum { - Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, - Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, - Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, - Opt_journal_path, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_resize, Opt_usrquota, Opt_grpquota -}; - -static const match_table_t tokens = { - {Opt_bsd_df, "bsddf"}, - {Opt_minix_df, "minixdf"}, - {Opt_grpid, "grpid"}, - {Opt_grpid, "bsdgroups"}, - {Opt_nogrpid, "nogrpid"}, - {Opt_nogrpid, "sysvgroups"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_sb, "sb=%u"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, - {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_reservation, "reservation"}, - {Opt_noreservation, "noreservation"}, - {Opt_noload, "noload"}, - {Opt_noload, "norecovery"}, - {Opt_nobh, "nobh"}, - {Opt_bh, "bh"}, - {Opt_commit, "commit=%u"}, - {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, - {Opt_journal_dev, "journal_dev=%u"}, - {Opt_journal_path, "journal_path=%s"}, - {Opt_abort, "abort"}, - {Opt_data_journal, "data=journal"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_err_abort, "data_err=abort"}, - {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_offusrjquota, "usrjquota="}, - {Opt_usrjquota, "usrjquota=%s"}, - {Opt_offgrpjquota, "grpjquota="}, - {Opt_grpjquota, "grpjquota=%s"}, - {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, - {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_grpquota, "grpquota"}, - {Opt_noquota, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_barrier, "barrier=%u"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_resize, "resize"}, - {Opt_err, NULL}, -}; - -static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) -{ - ext3_fsblk_t sb_block; - char *options = (char *) *data; - - if (!options || strncmp(options, "sb=", 3) != 0) - return 1; /* Default location */ - options += 3; - /*todo: use simple_strtoll with >32bit ext3 */ - sb_block = simple_strtoul(options, &options, 0); - if (*options && *options != ',') { - ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s", - (char *) *data); - return 1; - } - if (*options == ',') - options++; - *data = (void *) options; - return sb_block; -} - -#ifdef CONFIG_QUOTA -static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - char *qname; - - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, - "Cannot change journaled " - "quota options when quota turned on"); - return 0; - } - qname = match_strdup(args); - if (!qname) { - ext3_msg(sb, KERN_ERR, - "Not enough memory for storing quotafile name"); - return 0; - } - if (sbi->s_qf_names[qtype]) { - int same = !strcmp(sbi->s_qf_names[qtype], qname); - - kfree(qname); - if (!same) { - ext3_msg(sb, KERN_ERR, - "%s quota file already specified", - QTYPE2NAME(qtype)); - } - return same; - } - if (strchr(qname, '/')) { - ext3_msg(sb, KERN_ERR, - "quotafile must be on filesystem root"); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - set_opt(sbi->s_mount_opt, QUOTA); - return 1; -} - -static int clear_qf_name(struct super_block *sb, int qtype) { - - struct ext3_sb_info *sbi = EXT3_SB(sb); - - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options" - " when quota turned on"); - return 0; - } - if (sbi->s_qf_names[qtype]) { - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; - } - return 1; -} -#endif - -static int parse_options (char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, - ext3_fsblk_t *n_blocks_count, int is_remount) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - char * p; - substring_t args[MAX_OPT_ARGS]; - int data_opt = 0; - int option; - kuid_t uid; - kgid_t gid; - char *journal_path; - struct inode *journal_inode; - struct path path; - int error; - -#ifdef CONFIG_QUOTA - int qfmt; -#endif - - if (!options) - return 1; - - while ((p = strsep (&options, ",")) != NULL) { - int token; - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - clear_opt (sbi->s_mount_opt, MINIX_DF); - break; - case Opt_minix_df: - set_opt (sbi->s_mount_opt, MINIX_DF); - break; - case Opt_grpid: - set_opt (sbi->s_mount_opt, GRPID); - break; - case Opt_nogrpid: - clear_opt (sbi->s_mount_opt, GRPID); - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) { - ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option); - return 0; - - } - sbi->s_resuid = uid; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) { - ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option); - return 0; - } - sbi->s_resgid = gid; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_RO); - set_opt (sbi->s_mount_opt, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt (sbi->s_mount_opt, ERRORS_RO); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt (sbi->s_mount_opt, NO_UID32); - break; - case Opt_nocheck: - clear_opt (sbi->s_mount_opt, CHECK); - break; - case Opt_debug: - set_opt (sbi->s_mount_opt, DEBUG); - break; - case Opt_oldalloc: - ext3_msg(sb, KERN_WARNING, - "Ignoring deprecated oldalloc option"); - break; - case Opt_orlov: - ext3_msg(sb, KERN_WARNING, - "Ignoring deprecated orlov option"); - break; -#ifdef CONFIG_EXT3_FS_XATTR - case Opt_user_xattr: - set_opt (sbi->s_mount_opt, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt (sbi->s_mount_opt, XATTR_USER); - break; -#else - case Opt_user_xattr: - case Opt_nouser_xattr: - ext3_msg(sb, KERN_INFO, - "(no)user_xattr options not supported"); - break; -#endif -#ifdef CONFIG_EXT3_FS_POSIX_ACL - case Opt_acl: - set_opt(sbi->s_mount_opt, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sbi->s_mount_opt, POSIX_ACL); - break; -#else - case Opt_acl: - case Opt_noacl: - ext3_msg(sb, KERN_INFO, - "(no)acl options not supported"); - break; -#endif - case Opt_reservation: - set_opt(sbi->s_mount_opt, RESERVATION); - break; - case Opt_noreservation: - clear_opt(sbi->s_mount_opt, RESERVATION); - break; - case Opt_journal_update: - /* @@@ FIXME */ - /* Eventually we will want to be able to create - a journal file here. For now, only allow the - user to specify an existing inode to be the - journal file. */ - if (is_remount) { - ext3_msg(sb, KERN_ERR, "error: cannot specify " - "journal on remount"); - return 0; - } - set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); - break; - case Opt_journal_inum: - if (is_remount) { - ext3_msg(sb, KERN_ERR, "error: cannot specify " - "journal on remount"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *inum = option; - break; - case Opt_journal_dev: - if (is_remount) { - ext3_msg(sb, KERN_ERR, "error: cannot specify " - "journal on remount"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *journal_devnum = option; - break; - case Opt_journal_path: - if (is_remount) { - ext3_msg(sb, KERN_ERR, "error: cannot specify " - "journal on remount"); - return 0; - } - - journal_path = match_strdup(&args[0]); - if (!journal_path) { - ext3_msg(sb, KERN_ERR, "error: could not dup " - "journal device string"); - return 0; - } - - error = kern_path(journal_path, LOOKUP_FOLLOW, &path); - if (error) { - ext3_msg(sb, KERN_ERR, "error: could not find " - "journal device path: error %d", error); - kfree(journal_path); - return 0; - } - - journal_inode = d_inode(path.dentry); - if (!S_ISBLK(journal_inode->i_mode)) { - ext3_msg(sb, KERN_ERR, "error: journal path %s " - "is not a block device", journal_path); - path_put(&path); - kfree(journal_path); - return 0; - } - - *journal_devnum = new_encode_dev(journal_inode->i_rdev); - path_put(&path); - kfree(journal_path); - break; - case Opt_noload: - set_opt (sbi->s_mount_opt, NOLOAD); - break; - case Opt_commit: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = JBD_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * option; - break; - case Opt_data_journal: - data_opt = EXT3_MOUNT_JOURNAL_DATA; - goto datacheck; - case Opt_data_ordered: - data_opt = EXT3_MOUNT_ORDERED_DATA; - goto datacheck; - case Opt_data_writeback: - data_opt = EXT3_MOUNT_WRITEBACK_DATA; - datacheck: - if (is_remount) { - if (test_opt(sb, DATA_FLAGS) == data_opt) - break; - ext3_msg(sb, KERN_ERR, - "error: cannot change " - "data mode on remount. The filesystem " - "is mounted in data=%s mode and you " - "try to remount it in data=%s mode.", - data_mode_string(test_opt(sb, - DATA_FLAGS)), - data_mode_string(data_opt)); - return 0; - } else { - clear_opt(sbi->s_mount_opt, DATA_FLAGS); - sbi->s_mount_opt |= data_opt; - } - break; - case Opt_data_err_abort: - set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); - break; - case Opt_data_err_ignore: - clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); - break; -#ifdef CONFIG_QUOTA - case Opt_usrjquota: - if (!set_qf_name(sb, USRQUOTA, &args[0])) - return 0; - break; - case Opt_grpjquota: - if (!set_qf_name(sb, GRPQUOTA, &args[0])) - return 0; - break; - case Opt_offusrjquota: - if (!clear_qf_name(sb, USRQUOTA)) - return 0; - break; - case Opt_offgrpjquota: - if (!clear_qf_name(sb, GRPQUOTA)) - return 0; - break; - case Opt_jqfmt_vfsold: - qfmt = QFMT_VFS_OLD; - goto set_qf_format; - case Opt_jqfmt_vfsv0: - qfmt = QFMT_VFS_V0; - goto set_qf_format; - case Opt_jqfmt_vfsv1: - qfmt = QFMT_VFS_V1; -set_qf_format: - if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != qfmt) { - ext3_msg(sb, KERN_ERR, "error: cannot change " - "journaled quota options when " - "quota turned on."); - return 0; - } - sbi->s_jquota_fmt = qfmt; - break; - case Opt_quota: - case Opt_usrquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, GRPQUOTA); - break; - case Opt_noquota: - if (sb_any_quota_loaded(sb)) { - ext3_msg(sb, KERN_ERR, "error: cannot change " - "quota options when quota turned on."); - return 0; - } - clear_opt(sbi->s_mount_opt, QUOTA); - clear_opt(sbi->s_mount_opt, USRQUOTA); - clear_opt(sbi->s_mount_opt, GRPQUOTA); - break; -#else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - ext3_msg(sb, KERN_ERR, - "error: quota options not supported."); - break; - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - case Opt_jqfmt_vfsv1: - ext3_msg(sb, KERN_ERR, - "error: journaled quota options not " - "supported."); - break; - case Opt_noquota: - break; -#endif - case Opt_abort: - set_opt(sbi->s_mount_opt, ABORT); - break; - case Opt_nobarrier: - clear_opt(sbi->s_mount_opt, BARRIER); - break; - case Opt_barrier: - if (args[0].from) { - if (match_int(&args[0], &option)) - return 0; - } else - option = 1; /* No argument, default to 1 */ - if (option) - set_opt(sbi->s_mount_opt, BARRIER); - else - clear_opt(sbi->s_mount_opt, BARRIER); - break; - case Opt_ignore: - break; - case Opt_resize: - if (!is_remount) { - ext3_msg(sb, KERN_ERR, - "error: resize option only available " - "for remount"); - return 0; - } - if (match_int(&args[0], &option) != 0) - return 0; - *n_blocks_count = option; - break; - case Opt_nobh: - ext3_msg(sb, KERN_WARNING, - "warning: ignoring deprecated nobh option"); - break; - case Opt_bh: - ext3_msg(sb, KERN_WARNING, - "warning: ignoring deprecated bh option"); - break; - default: - ext3_msg(sb, KERN_ERR, - "error: unrecognized mount option \"%s\" " - "or missing value", p); - return 0; - } - } -#ifdef CONFIG_QUOTA - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) - clear_opt(sbi->s_mount_opt, USRQUOTA); - if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) - clear_opt(sbi->s_mount_opt, GRPQUOTA); - - if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { - ext3_msg(sb, KERN_ERR, "error: old and new quota " - "format mixing."); - return 0; - } - - if (!sbi->s_jquota_fmt) { - ext3_msg(sb, KERN_ERR, "error: journaled quota format " - "not specified."); - return 0; - } - } -#endif - return 1; -} - -static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, - int read_only) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - int res = 0; - - if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { - ext3_msg(sb, KERN_ERR, - "error: revision level too high, " - "forcing read-only mode"); - res = MS_RDONLY; - } - if (read_only) - return res; - if (!(sbi->s_mount_state & EXT3_VALID_FS)) - ext3_msg(sb, KERN_WARNING, - "warning: mounting unchecked fs, " - "running e2fsck is recommended"); - else if ((sbi->s_mount_state & EXT3_ERROR_FS)) - ext3_msg(sb, KERN_WARNING, - "warning: mounting fs with errors, " - "running e2fsck is recommended"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && - le16_to_cpu(es->s_mnt_count) >= - le16_to_cpu(es->s_max_mnt_count)) - ext3_msg(sb, KERN_WARNING, - "warning: maximal mount count reached, " - "running e2fsck is recommended"); - else if (le32_to_cpu(es->s_checkinterval) && - (le32_to_cpu(es->s_lastcheck) + - le32_to_cpu(es->s_checkinterval) <= get_seconds())) - ext3_msg(sb, KERN_WARNING, - "warning: checktime reached, " - "running e2fsck is recommended"); -#if 0 - /* @@@ We _will_ want to clear the valid bit if we find - inconsistencies, to force a fsck at reboot. But for - a plain journaled filesystem we can keep it set as - valid forever! :) */ - es->s_state &= cpu_to_le16(~EXT3_VALID_FS); -#endif - if (!le16_to_cpu(es->s_max_mnt_count)) - es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); - le16_add_cpu(&es->s_mnt_count, 1); - es->s_mtime = cpu_to_le32(get_seconds()); - ext3_update_dynamic_rev(sb); - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - - ext3_commit_super(sb, es, 1); - if (test_opt(sb, DEBUG)) - ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, " - "bpg=%lu, ipg=%lu, mo=%04lx]", - sb->s_blocksize, - sbi->s_groups_count, - EXT3_BLOCKS_PER_GROUP(sb), - EXT3_INODES_PER_GROUP(sb), - sbi->s_mount_opt); - - if (EXT3_SB(sb)->s_journal->j_inode == NULL) { - char b[BDEVNAME_SIZE]; - ext3_msg(sb, KERN_INFO, "using external journal on %s", - bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); - } else { - ext3_msg(sb, KERN_INFO, "using internal journal"); - } - cleancache_init_fs(sb); - return res; -} - -/* Called at mount-time, super-block is locked */ -static int ext3_check_descriptors(struct super_block *sb) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - int i; - - ext3_debug ("Checking group descriptors"); - - for (i = 0; i < sbi->s_groups_count; i++) { - struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL); - ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i); - ext3_fsblk_t last_block; - - if (i == sbi->s_groups_count - 1) - last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; - else - last_block = first_block + - (EXT3_BLOCKS_PER_GROUP(sb) - 1); - - if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || - le32_to_cpu(gdp->bg_block_bitmap) > last_block) - { - ext3_error (sb, "ext3_check_descriptors", - "Block bitmap for group %d" - " not in group (block %lu)!", - i, (unsigned long) - le32_to_cpu(gdp->bg_block_bitmap)); - return 0; - } - if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || - le32_to_cpu(gdp->bg_inode_bitmap) > last_block) - { - ext3_error (sb, "ext3_check_descriptors", - "Inode bitmap for group %d" - " not in group (block %lu)!", - i, (unsigned long) - le32_to_cpu(gdp->bg_inode_bitmap)); - return 0; - } - if (le32_to_cpu(gdp->bg_inode_table) < first_block || - le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 > - last_block) - { - ext3_error (sb, "ext3_check_descriptors", - "Inode table for group %d" - " not in group (block %lu)!", - i, (unsigned long) - le32_to_cpu(gdp->bg_inode_table)); - return 0; - } - } - - sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); - sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb)); - return 1; -} - - -/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at - * the superblock) which were deleted from all directories, but held open by - * a process at the time of a crash. We walk the list and try to delete these - * inodes at recovery time (only with a read-write filesystem). - * - * In order to keep the orphan inode chain consistent during traversal (in - * case of crash during recovery), we link each inode into the superblock - * orphan list_head and handle it the same way as an inode deletion during - * normal operation (which journals the operations for us). - * - * We only do an iget() and an iput() on each inode, which is very safe if we - * accidentally point at an in-use or already deleted inode. The worst that - * can happen in this case is that we get a "bit already cleared" message from - * ext3_free_inode(). The only reason we would point at a wrong inode is if - * e2fsck was run on this filesystem, and it must have already done the orphan - * inode cleanup for us, so we can safely abort without any further action. - */ -static void ext3_orphan_cleanup (struct super_block * sb, - struct ext3_super_block * es) -{ - unsigned int s_flags = sb->s_flags; - int nr_orphans = 0, nr_truncates = 0; -#ifdef CONFIG_QUOTA - int i; -#endif - if (!es->s_last_orphan) { - jbd_debug(4, "no orphan inodes to clean up\n"); - return; - } - - if (bdev_read_only(sb->s_bdev)) { - ext3_msg(sb, KERN_ERR, "error: write access " - "unavailable, skipping orphan cleanup."); - return; - } - - /* Check if feature set allows readwrite operations */ - if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { - ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " - "unknown ROCOMPAT features"); - return; - } - - if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { - /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { - jbd_debug(1, "Errors on filesystem, " - "clearing orphan list.\n"); - es->s_last_orphan = 0; - } - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - return; - } - - if (s_flags & MS_RDONLY) { - ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~MS_RDONLY; - } -#ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < EXT3_MAXQUOTAS; i++) { - if (EXT3_SB(sb)->s_qf_names[i]) { - int ret = ext3_quota_on_mount(sb, i); - if (ret < 0) - ext3_msg(sb, KERN_ERR, - "error: cannot turn on journaled " - "quota: %d", ret); - } - } -#endif - - while (es->s_last_orphan) { - struct inode *inode; - - inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); - if (IS_ERR(inode)) { - es->s_last_orphan = 0; - break; - } - - list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - dquot_initialize(inode); - if (inode->i_nlink) { - printk(KERN_DEBUG - "%s: truncating inode %lu to %Ld bytes\n", - __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %Ld bytes\n", - inode->i_ino, inode->i_size); - ext3_truncate(inode); - nr_truncates++; - } else { - printk(KERN_DEBUG - "%s: deleting unreferenced inode %lu\n", - __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); - nr_orphans++; - } - iput(inode); /* The delete magic happens here! */ - } - -#define PLURAL(x) (x), ((x)==1) ? "" : "s" - - if (nr_orphans) - ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted", - PLURAL(nr_orphans)); - if (nr_truncates) - ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up", - PLURAL(nr_truncates)); -#ifdef CONFIG_QUOTA - /* Turn quotas off */ - for (i = 0; i < EXT3_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); - } -#endif - sb->s_flags = s_flags; /* Restore MS_RDONLY status */ -} - -/* - * Maximal file size. There is a direct, and {,double-,triple-}indirect - * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. - * We need to be 1 filesystem block less than the 2^32 sector limit. - */ -static loff_t ext3_max_size(int bits) -{ - loff_t res = EXT3_NDIR_BLOCKS; - int meta_blocks; - loff_t upper_limit; - - /* This is calculated to be the largest file size for a - * dense, file such that the total number of - * sectors in the file, including data and all indirect blocks, - * does not exceed 2^32 -1 - * __u32 i_blocks representing the total number of - * 512 bytes blocks of the file - */ - upper_limit = (1LL << 32) - 1; - - /* total blocks in file system block size */ - upper_limit >>= (bits - 9); - - - /* indirect blocks */ - meta_blocks = 1; - /* double indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)); - /* tripple indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); - - upper_limit -= meta_blocks; - upper_limit <<= bits; - - res += 1LL << (bits-2); - res += 1LL << (2*(bits-2)); - res += 1LL << (3*(bits-2)); - res <<= bits; - if (res > upper_limit) - res = upper_limit; - - if (res > MAX_LFS_FILESIZE) - res = MAX_LFS_FILESIZE; - - return res; -} - -static ext3_fsblk_t descriptor_loc(struct super_block *sb, - ext3_fsblk_t logic_sb_block, - int nr) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - unsigned long bg, first_meta_bg; - int has_super = 0; - - first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); - - if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || - nr < first_meta_bg) - return (logic_sb_block + nr + 1); - bg = sbi->s_desc_per_block * nr; - if (ext3_bg_has_super(sb, bg)) - has_super = 1; - return (has_super + ext3_group_first_block_no(sb, bg)); -} - - -static int ext3_fill_super (struct super_block *sb, void *data, int silent) -{ - struct buffer_head * bh; - struct ext3_super_block *es = NULL; - struct ext3_sb_info *sbi; - ext3_fsblk_t block; - ext3_fsblk_t sb_block = get_sb_block(&data, sb); - ext3_fsblk_t logic_sb_block; - unsigned long offset = 0; - unsigned int journal_inum = 0; - unsigned long journal_devnum = 0; - unsigned long def_mount_opts; - struct inode *root; - int blocksize; - int hblock; - int db_count; - int i; - int needs_recovery; - int ret = -EINVAL; - __le32 features; - int err; - - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sbi->s_blockgroup_lock = - kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - return -ENOMEM; - } - sb->s_fs_info = sbi; - sbi->s_sb_block = sb_block; - - blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); - if (!blocksize) { - ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); - goto out_fail; - } - - /* - * The ext3 superblock will not be buffer aligned for other than 1kB - * block sizes. We need to calculate the offset from buffer start. - */ - if (blocksize != EXT3_MIN_BLOCK_SIZE) { - logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; - offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; - } else { - logic_sb_block = sb_block; - } - - if (!(bh = sb_bread(sb, logic_sb_block))) { - ext3_msg(sb, KERN_ERR, "error: unable to read superblock"); - goto out_fail; - } - /* - * Note: s_es must be initialized as soon as possible because - * some ext3 macro-instructions depend on its value - */ - es = (struct ext3_super_block *) (bh->b_data + offset); - sbi->s_es = es; - sb->s_magic = le16_to_cpu(es->s_magic); - if (sb->s_magic != EXT3_SUPER_MAGIC) - goto cantfind_ext3; - - /* Set defaults before we parse the mount options */ - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - if (def_mount_opts & EXT3_DEFM_DEBUG) - set_opt(sbi->s_mount_opt, DEBUG); - if (def_mount_opts & EXT3_DEFM_BSDGROUPS) - set_opt(sbi->s_mount_opt, GRPID); - if (def_mount_opts & EXT3_DEFM_UID16) - set_opt(sbi->s_mount_opt, NO_UID32); -#ifdef CONFIG_EXT3_FS_XATTR - if (def_mount_opts & EXT3_DEFM_XATTR_USER) - set_opt(sbi->s_mount_opt, XATTR_USER); -#endif -#ifdef CONFIG_EXT3_FS_POSIX_ACL - if (def_mount_opts & EXT3_DEFM_ACL) - set_opt(sbi->s_mount_opt, POSIX_ACL); -#endif - if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) - set_opt(sbi->s_mount_opt, JOURNAL_DATA); - else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) - set_opt(sbi->s_mount_opt, ORDERED_DATA); - else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) - set_opt(sbi->s_mount_opt, WRITEBACK_DATA); - - if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) - set_opt(sbi->s_mount_opt, ERRORS_PANIC); - else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE) - set_opt(sbi->s_mount_opt, ERRORS_CONT); - else - set_opt(sbi->s_mount_opt, ERRORS_RO); - - sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); - sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); - - /* enable barriers by default */ - set_opt(sbi->s_mount_opt, BARRIER); - set_opt(sbi->s_mount_opt, RESERVATION); - - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) - goto failed_mount; - - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); - - if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && - (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || - EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || - EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) - ext3_msg(sb, KERN_WARNING, - "warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended"); - /* - * Check feature flags regardless of the revision level, since we - * previously didn't change the revision level when setting the flags, - * so there is a chance incompat flags are set on a rev 0 filesystem. - */ - features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); - if (features) { - ext3_msg(sb, KERN_ERR, - "error: couldn't mount because of unsupported " - "optional features (%x)", le32_to_cpu(features)); - goto failed_mount; - } - features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); - if (!(sb->s_flags & MS_RDONLY) && features) { - ext3_msg(sb, KERN_ERR, - "error: couldn't mount RDWR because of unsupported " - "optional features (%x)", le32_to_cpu(features)); - goto failed_mount; - } - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - - if (blocksize < EXT3_MIN_BLOCK_SIZE || - blocksize > EXT3_MAX_BLOCK_SIZE) { - ext3_msg(sb, KERN_ERR, - "error: couldn't mount because of unsupported " - "filesystem blocksize %d", blocksize); - goto failed_mount; - } - - hblock = bdev_logical_block_size(sb->s_bdev); - if (sb->s_blocksize != blocksize) { - /* - * Make sure the blocksize for the filesystem is larger - * than the hardware sectorsize for the machine. - */ - if (blocksize < hblock) { - ext3_msg(sb, KERN_ERR, - "error: fsblocksize %d too small for " - "hardware sectorsize %d", blocksize, hblock); - goto failed_mount; - } - - brelse (bh); - if (!sb_set_blocksize(sb, blocksize)) { - ext3_msg(sb, KERN_ERR, - "error: bad blocksize %d", blocksize); - goto out_fail; - } - logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; - offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; - bh = sb_bread(sb, logic_sb_block); - if (!bh) { - ext3_msg(sb, KERN_ERR, - "error: can't read superblock on 2nd try"); - goto failed_mount; - } - es = (struct ext3_super_block *)(bh->b_data + offset); - sbi->s_es = es; - if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { - ext3_msg(sb, KERN_ERR, - "error: magic mismatch"); - goto failed_mount; - } - } - - sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); - - if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { - sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; - sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; - } else { - sbi->s_inode_size = le16_to_cpu(es->s_inode_size); - sbi->s_first_ino = le32_to_cpu(es->s_first_ino); - if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || - (!is_power_of_2(sbi->s_inode_size)) || - (sbi->s_inode_size > blocksize)) { - ext3_msg(sb, KERN_ERR, - "error: unsupported inode size: %d", - sbi->s_inode_size); - goto failed_mount; - } - } - sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << - le32_to_cpu(es->s_log_frag_size); - if (blocksize != sbi->s_frag_size) { - ext3_msg(sb, KERN_ERR, - "error: fragsize %lu != blocksize %u (unsupported)", - sbi->s_frag_size, blocksize); - goto failed_mount; - } - sbi->s_frags_per_block = 1; - sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); - sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); - sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext3; - sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0) - goto cantfind_ext3; - sbi->s_itb_per_group = sbi->s_inodes_per_group / - sbi->s_inodes_per_block; - sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); - sbi->s_sbh = bh; - sbi->s_mount_state = le16_to_cpu(es->s_state); - sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb)); - for (i = 0; i < 4; i++) - sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { -#ifdef __CHAR_UNSIGNED__ - es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; -#else - es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); -#endif - } - - if (sbi->s_blocks_per_group > blocksize * 8) { - ext3_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; - } - if (sbi->s_frags_per_group > blocksize * 8) { - ext3_msg(sb, KERN_ERR, - "error: #fragments per group too big: %lu", - sbi->s_frags_per_group); - goto failed_mount; - } - if (sbi->s_inodes_per_group > blocksize * 8) { - ext3_msg(sb, KERN_ERR, - "error: #inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - - err = generic_check_addressable(sb->s_blocksize_bits, - le32_to_cpu(es->s_blocks_count)); - if (err) { - ext3_msg(sb, KERN_ERR, - "error: filesystem is too large to mount safely"); - if (sizeof(sector_t) < 8) - ext3_msg(sb, KERN_ERR, - "error: CONFIG_LBDAF not enabled"); - ret = err; - goto failed_mount; - } - - if (EXT3_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext3; - sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) - 1) - / EXT3_BLOCKS_PER_GROUP(sb)) + 1; - db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb)); - sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), - GFP_KERNEL); - if (sbi->s_group_desc == NULL) { - ext3_msg(sb, KERN_ERR, - "error: not enough memory"); - ret = -ENOMEM; - goto failed_mount; - } - - bgl_lock_init(sbi->s_blockgroup_lock); - - for (i = 0; i < db_count; i++) { - block = descriptor_loc(sb, logic_sb_block, i); - sbi->s_group_desc[i] = sb_bread(sb, block); - if (!sbi->s_group_desc[i]) { - ext3_msg(sb, KERN_ERR, - "error: can't read group descriptor %d", i); - db_count = i; - goto failed_mount2; - } - } - if (!ext3_check_descriptors (sb)) { - ext3_msg(sb, KERN_ERR, - "error: group descriptors corrupted"); - goto failed_mount2; - } - sbi->s_gdb_count = db_count; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); - spin_lock_init(&sbi->s_next_gen_lock); - - /* per fileystem reservation list head & lock */ - spin_lock_init(&sbi->s_rsv_window_lock); - sbi->s_rsv_window_root = RB_ROOT; - /* Add a single, static dummy reservation to the start of the - * reservation window list --- it gives us a placeholder for - * append-at-start-of-list which makes the allocation logic - * _much_ simpler. */ - sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_alloc_hit = 0; - sbi->s_rsv_window_head.rsv_goal_size = 0; - ext3_rsv_window_add(sb, &sbi->s_rsv_window_head); - - /* - * set up enough so that it can read an inode - */ - sb->s_op = &ext3_sops; - sb->s_export_op = &ext3_export_ops; - sb->s_xattr = ext3_xattr_handlers; -#ifdef CONFIG_QUOTA - sb->s_qcop = &ext3_qctl_operations; - sb->dq_op = &ext3_quota_operations; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; -#endif - memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); - INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ - mutex_init(&sbi->s_orphan_lock); - mutex_init(&sbi->s_resize_lock); - - sb->s_root = NULL; - - needs_recovery = (es->s_last_orphan != 0 || - EXT3_HAS_INCOMPAT_FEATURE(sb, - EXT3_FEATURE_INCOMPAT_RECOVER)); - - /* - * The first inode we look at is the journal inode. Don't try - * root first: it may be modified in the journal! - */ - if (!test_opt(sb, NOLOAD) && - EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { - if (ext3_load_journal(sb, es, journal_devnum)) - goto failed_mount2; - } else if (journal_inum) { - if (ext3_create_journal(sb, es, journal_inum)) - goto failed_mount2; - } else { - if (!silent) - ext3_msg(sb, KERN_ERR, - "error: no journal found. " - "mounting ext3 over ext2?"); - goto failed_mount2; - } - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb), GFP_KERNEL); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb), GFP_KERNEL); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb), GFP_KERNEL); - } - if (err) { - ext3_msg(sb, KERN_ERR, "error: insufficient memory"); - ret = err; - goto failed_mount3; - } - - /* We have now updated the journal if required, so we can - * validate the data journaling mode. */ - switch (test_opt(sb, DATA_FLAGS)) { - case 0: - /* No mode set, assume a default based on the journal - capabilities: ORDERED_DATA if the journal can - cope, else JOURNAL_DATA */ - if (journal_check_available_features - (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); - else - set_opt(sbi->s_mount_opt, JOURNAL_DATA); - break; - - case EXT3_MOUNT_ORDERED_DATA: - case EXT3_MOUNT_WRITEBACK_DATA: - if (!journal_check_available_features - (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { - ext3_msg(sb, KERN_ERR, - "error: journal does not support " - "requested data journaling mode"); - goto failed_mount3; - } - default: - break; - } - - /* - * The journal_load will have done any necessary log recovery, - * so we can safely mount the rest of the filesystem now. - */ - - root = ext3_iget(sb, EXT3_ROOT_INO); - if (IS_ERR(root)) { - ext3_msg(sb, KERN_ERR, "error: get root inode failed"); - ret = PTR_ERR(root); - goto failed_mount3; - } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { - iput(root); - ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); - goto failed_mount3; - } - sb->s_root = d_make_root(root); - if (!sb->s_root) { - ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); - ret = -ENOMEM; - goto failed_mount3; - } - - if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY)) - sb->s_flags |= MS_RDONLY; - - EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; - ext3_orphan_cleanup(sb, es); - EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; - if (needs_recovery) { - ext3_mark_recovery_complete(sb, es); - ext3_msg(sb, KERN_INFO, "recovery complete"); - } - ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", - test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": - test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": - "writeback"); - - return 0; - -cantfind_ext3: - if (!silent) - ext3_msg(sb, KERN_INFO, - "error: can't find ext3 filesystem on dev %s.", - sb->s_id); - goto failed_mount; - -failed_mount3: - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - journal_destroy(sbi->s_journal); -failed_mount2: - for (i = 0; i < db_count; i++) - brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); -failed_mount: -#ifdef CONFIG_QUOTA - for (i = 0; i < EXT3_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); -#endif - ext3_blkdev_remove(sbi); - brelse(bh); -out_fail: - sb->s_fs_info = NULL; - kfree(sbi->s_blockgroup_lock); - kfree(sbi); - return ret; -} - -/* - * Setup any per-fs journal parameters now. We'll do this both on - * initial mount, once the journal has been initialised but before we've - * done any recovery; and again on any subsequent remount. - */ -static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) -{ - struct ext3_sb_info *sbi = EXT3_SB(sb); - - if (sbi->s_commit_interval) - journal->j_commit_interval = sbi->s_commit_interval; - /* We could also set up an ext3-specific default for the commit - * interval here, but for now we'll just fall back to the jbd - * default. */ - - spin_lock(&journal->j_state_lock); - if (test_opt(sb, BARRIER)) - journal->j_flags |= JFS_BARRIER; - else - journal->j_flags &= ~JFS_BARRIER; - if (test_opt(sb, DATA_ERR_ABORT)) - journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; - else - journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; - spin_unlock(&journal->j_state_lock); -} - -static journal_t *ext3_get_journal(struct super_block *sb, - unsigned int journal_inum) -{ - struct inode *journal_inode; - journal_t *journal; - - /* First, test for the existence of a valid inode on disk. Bad - * things happen if we iget() an unused inode, as the subsequent - * iput() will try to delete it. */ - - journal_inode = ext3_iget(sb, journal_inum); - if (IS_ERR(journal_inode)) { - ext3_msg(sb, KERN_ERR, "error: no journal found"); - return NULL; - } - if (!journal_inode->i_nlink) { - make_bad_inode(journal_inode); - iput(journal_inode); - ext3_msg(sb, KERN_ERR, "error: journal inode is deleted"); - return NULL; - } - - jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", - journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { - ext3_msg(sb, KERN_ERR, "error: invalid journal inode"); - iput(journal_inode); - return NULL; - } - - journal = journal_init_inode(journal_inode); - if (!journal) { - ext3_msg(sb, KERN_ERR, "error: could not load journal inode"); - iput(journal_inode); - return NULL; - } - journal->j_private = sb; - ext3_init_journal_params(sb, journal); - return journal; -} - -static journal_t *ext3_get_dev_journal(struct super_block *sb, - dev_t j_dev) -{ - struct buffer_head * bh; - journal_t *journal; - ext3_fsblk_t start; - ext3_fsblk_t len; - int hblock, blocksize; - ext3_fsblk_t sb_block; - unsigned long offset; - struct ext3_super_block * es; - struct block_device *bdev; - - bdev = ext3_blkdev_get(j_dev, sb); - if (bdev == NULL) - return NULL; - - blocksize = sb->s_blocksize; - hblock = bdev_logical_block_size(bdev); - if (blocksize < hblock) { - ext3_msg(sb, KERN_ERR, - "error: blocksize too small for journal device"); - goto out_bdev; - } - - sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; - offset = EXT3_MIN_BLOCK_SIZE % blocksize; - set_blocksize(bdev, blocksize); - if (!(bh = __bread(bdev, sb_block, blocksize))) { - ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of " - "external journal"); - goto out_bdev; - } - - es = (struct ext3_super_block *) (bh->b_data + offset); - if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || - !(le32_to_cpu(es->s_feature_incompat) & - EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext3_msg(sb, KERN_ERR, "error: external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; - } - - if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - ext3_msg(sb, KERN_ERR, "error: journal UUID does not match"); - brelse(bh); - goto out_bdev; - } - - len = le32_to_cpu(es->s_blocks_count); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ - - journal = journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); - if (!journal) { - ext3_msg(sb, KERN_ERR, - "error: failed to create device journal"); - goto out_bdev; - } - journal->j_private = sb; - if (!bh_uptodate_or_lock(journal->j_sb_buffer)) { - if (bh_submit_read(journal->j_sb_buffer)) { - ext3_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } - } - if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - ext3_msg(sb, KERN_ERR, - "error: external journal has more than one " - "user (unsupported) - %d", - be32_to_cpu(journal->j_superblock->s_nr_users)); - goto out_journal; - } - EXT3_SB(sb)->journal_bdev = bdev; - ext3_init_journal_params(sb, journal); - return journal; -out_journal: - journal_destroy(journal); -out_bdev: - ext3_blkdev_put(bdev); - return NULL; -} - -static int ext3_load_journal(struct super_block *sb, - struct ext3_super_block *es, - unsigned long journal_devnum) -{ - journal_t *journal; - unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); - dev_t journal_dev; - int err = 0; - int really_read_only; - - if (journal_devnum && - journal_devnum != le32_to_cpu(es->s_journal_dev)) { - ext3_msg(sb, KERN_INFO, "external journal device major/minor " - "numbers have changed"); - journal_dev = new_decode_dev(journal_devnum); - } else - journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); - - really_read_only = bdev_read_only(sb->s_bdev); - - /* - * Are we loading a blank journal or performing recovery after a - * crash? For recovery, we need to check in advance whether we - * can get read-write access to the device. - */ - - if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { - if (sb->s_flags & MS_RDONLY) { - ext3_msg(sb, KERN_INFO, - "recovery required on readonly filesystem"); - if (really_read_only) { - ext3_msg(sb, KERN_ERR, "error: write access " - "unavailable, cannot proceed"); - return -EROFS; - } - ext3_msg(sb, KERN_INFO, - "write access will be enabled during recovery"); - } - } - - if (journal_inum && journal_dev) { - ext3_msg(sb, KERN_ERR, "error: filesystem has both journal " - "and inode journals"); - return -EINVAL; - } - - if (journal_inum) { - if (!(journal = ext3_get_journal(sb, journal_inum))) - return -EINVAL; - } else { - if (!(journal = ext3_get_dev_journal(sb, journal_dev))) - return -EINVAL; - } - - if (!(journal->j_flags & JFS_BARRIER)) - printk(KERN_INFO "EXT3-fs: barriers not enabled\n"); - - if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { - err = journal_update_format(journal); - if (err) { - ext3_msg(sb, KERN_ERR, "error updating journal"); - journal_destroy(journal); - return err; - } - } - - if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) - err = journal_wipe(journal, !really_read_only); - if (!err) - err = journal_load(journal); - - if (err) { - ext3_msg(sb, KERN_ERR, "error loading journal"); - journal_destroy(journal); - return err; - } - - EXT3_SB(sb)->s_journal = journal; - ext3_clear_journal_err(sb, es); - - if (!really_read_only && journal_devnum && - journal_devnum != le32_to_cpu(es->s_journal_dev)) { - es->s_journal_dev = cpu_to_le32(journal_devnum); - - /* Make sure we flush the recovery flag to disk. */ - ext3_commit_super(sb, es, 1); - } - - return 0; -} - -static int ext3_create_journal(struct super_block *sb, - struct ext3_super_block *es, - unsigned int journal_inum) -{ - journal_t *journal; - int err; - - if (sb->s_flags & MS_RDONLY) { - ext3_msg(sb, KERN_ERR, - "error: readonly filesystem when trying to " - "create journal"); - return -EROFS; - } - - journal = ext3_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; - - ext3_msg(sb, KERN_INFO, "creating new journal on inode %u", - journal_inum); - - err = journal_create(journal); - if (err) { - ext3_msg(sb, KERN_ERR, "error creating journal"); - journal_destroy(journal); - return -EIO; - } - - EXT3_SB(sb)->s_journal = journal; - - ext3_update_dynamic_rev(sb); - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); - - es->s_journal_inum = cpu_to_le32(journal_inum); - - /* Make sure we flush the recovery flag to disk. */ - ext3_commit_super(sb, es, 1); - - return 0; -} - -static int ext3_commit_super(struct super_block *sb, - struct ext3_super_block *es, - int sync) -{ - struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; - int error = 0; - - if (!sbh) - return error; - - if (buffer_write_io_error(sbh)) { - /* - * Oh, dear. A previous attempt to write the - * superblock failed. This could happen because the - * USB device was yanked out. Or it could happen to - * be a transient write error and maybe the block will - * be remapped. Nothing we can do but to retry the - * write and hope for the best. - */ - ext3_msg(sb, KERN_ERR, "previous I/O error to " - "superblock detected"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } - /* - * If the file system is mounted read-only, don't update the - * superblock write time. This avoids updating the superblock - * write time when we are mounting the root file system - * read/only but we need to replay the journal; at that point, - * for people who are east of GMT and who make their clock - * tick in localtime for Windows bug-for-bug compatibility, - * the clock is set in the future, and this will cause e2fsck - * to complain and force a full file system check. - */ - if (!(sb->s_flags & MS_RDONLY)) - es->s_wtime = cpu_to_le32(get_seconds()); - es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); - es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); - BUFFER_TRACE(sbh, "marking dirty"); - mark_buffer_dirty(sbh); - if (sync) { - error = sync_dirty_buffer(sbh); - if (buffer_write_io_error(sbh)) { - ext3_msg(sb, KERN_ERR, "I/O error while writing " - "superblock"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } - } - return error; -} - - -/* - * Have we just finished recovery? If so, and if we are mounting (or - * remounting) the filesystem readonly, then we will end up with a - * consistent fs on disk. Record that fact. - */ -static void ext3_mark_recovery_complete(struct super_block * sb, - struct ext3_super_block * es) -{ - journal_t *journal = EXT3_SB(sb)->s_journal; - - journal_lock_updates(journal); - if (journal_flush(journal) < 0) - goto out; - - if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && - sb->s_flags & MS_RDONLY) { - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, es, 1); - } - -out: - journal_unlock_updates(journal); -} - -/* - * If we are mounting (or read-write remounting) a filesystem whose journal - * has recorded an error from a previous lifetime, move that error to the - * main filesystem now. - */ -static void ext3_clear_journal_err(struct super_block *sb, - struct ext3_super_block *es) -{ - journal_t *journal; - int j_errno; - const char *errstr; - - journal = EXT3_SB(sb)->s_journal; - - /* - * Now check for any error status which may have been recorded in the - * journal by a prior ext3_error() or ext3_abort() - */ - - j_errno = journal_errno(journal); - if (j_errno) { - char nbuf[16]; - - errstr = ext3_decode_error(sb, j_errno, nbuf); - ext3_warning(sb, __func__, "Filesystem error recorded " - "from previous mount: %s", errstr); - ext3_warning(sb, __func__, "Marking fs in need of " - "filesystem check."); - - EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - es->s_state |= cpu_to_le16(EXT3_ERROR_FS); - ext3_commit_super (sb, es, 1); - - journal_clear_err(journal); - } -} - -/* - * Force the running and committing transactions to commit, - * and wait on the commit. - */ -int ext3_force_commit(struct super_block *sb) -{ - journal_t *journal; - int ret; - - if (sb->s_flags & MS_RDONLY) - return 0; - - journal = EXT3_SB(sb)->s_journal; - ret = ext3_journal_force_commit(journal); - return ret; -} - -static int ext3_sync_fs(struct super_block *sb, int wait) -{ - tid_t target; - - trace_ext3_sync_fs(sb, wait); - /* - * Writeback quota in non-journalled quota case - journalled quota has - * no dirty dquots - */ - dquot_writeback_dquots(sb, -1); - if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { - if (wait) - log_wait_commit(EXT3_SB(sb)->s_journal, target); - } - return 0; -} - -/* - * LVM calls this function before a (read-only) snapshot is created. This - * gives us a chance to flush the journal completely and mark the fs clean. - */ -static int ext3_freeze(struct super_block *sb) -{ - int error = 0; - journal_t *journal; - - if (!(sb->s_flags & MS_RDONLY)) { - journal = EXT3_SB(sb)->s_journal; - - /* Now we set up the journal barrier. */ - journal_lock_updates(journal); - - /* - * We don't want to clear needs_recovery flag when we failed - * to flush the journal. - */ - error = journal_flush(journal); - if (error < 0) - goto out; - - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); - if (error) - goto out; - } - return 0; - -out: - journal_unlock_updates(journal); - return error; -} - -/* - * Called by LVM after the snapshot is done. We need to reset the RECOVER - * flag here, even though the filesystem is not technically dirty yet. - */ -static int ext3_unfreeze(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) { - /* Reser the needs_recovery flag before the fs is unlocked. */ - EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); - journal_unlock_updates(EXT3_SB(sb)->s_journal); - } - return 0; -} - -static int ext3_remount (struct super_block * sb, int * flags, char * data) -{ - struct ext3_super_block * es; - struct ext3_sb_info *sbi = EXT3_SB(sb); - ext3_fsblk_t n_blocks_count = 0; - unsigned long old_sb_flags; - struct ext3_mount_options old_opts; - int enable_quota = 0; - int err; -#ifdef CONFIG_QUOTA - int i; -#endif - - sync_filesystem(sb); - - /* Store the original options */ - old_sb_flags = sb->s_flags; - old_opts.s_mount_opt = sbi->s_mount_opt; - old_opts.s_resuid = sbi->s_resuid; - old_opts.s_resgid = sbi->s_resgid; - old_opts.s_commit_interval = sbi->s_commit_interval; -#ifdef CONFIG_QUOTA - old_opts.s_jquota_fmt = sbi->s_jquota_fmt; - for (i = 0; i < EXT3_MAXQUOTAS; i++) - if (sbi->s_qf_names[i]) { - old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); - if (!old_opts.s_qf_names[i]) { - int j; - - for (j = 0; j < i; j++) - kfree(old_opts.s_qf_names[j]); - return -ENOMEM; - } - } else - old_opts.s_qf_names[i] = NULL; -#endif - - /* - * Allow the "check" option to be passed as a remount option. - */ - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { - err = -EINVAL; - goto restore_opts; - } - - if (test_opt(sb, ABORT)) - ext3_abort(sb, __func__, "Abort forced by user"); - - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); - - es = sbi->s_es; - - ext3_init_journal_params(sb, sbi->s_journal); - - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || - n_blocks_count > le32_to_cpu(es->s_blocks_count)) { - if (test_opt(sb, ABORT)) { - err = -EROFS; - goto restore_opts; - } - - if (*flags & MS_RDONLY) { - err = dquot_suspend(sb, -1); - if (err < 0) - goto restore_opts; - - /* - * First of all, the unconditional stuff we have to do - * to disable replay of the journal when we next remount - */ - sb->s_flags |= MS_RDONLY; - - /* - * OK, test if we are remounting a valid rw partition - * readonly, and if so set the rdonly flag and then - * mark the partition as valid again. - */ - if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && - (sbi->s_mount_state & EXT3_VALID_FS)) - es->s_state = cpu_to_le16(sbi->s_mount_state); - - ext3_mark_recovery_complete(sb, es); - } else { - __le32 ret; - if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, - ~EXT3_FEATURE_RO_COMPAT_SUPP))) { - ext3_msg(sb, KERN_WARNING, - "warning: couldn't remount RDWR " - "because of unsupported optional " - "features (%x)", le32_to_cpu(ret)); - err = -EROFS; - goto restore_opts; - } - - /* - * If we have an unprocessed orphan list hanging - * around from a previously readonly bdev mount, - * require a full umount & mount for now. - */ - if (es->s_last_orphan) { - ext3_msg(sb, KERN_WARNING, "warning: couldn't " - "remount RDWR because of unprocessed " - "orphan inode list. Please " - "umount & mount instead."); - err = -EINVAL; - goto restore_opts; - } - - /* - * Mounting a RDONLY partition read-write, so reread - * and store the current valid flag. (It may have - * been changed by e2fsck since we originally mounted - * the partition.) - */ - ext3_clear_journal_err(sb, es); - sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((err = ext3_group_extend(sb, es, n_blocks_count))) - goto restore_opts; - if (!ext3_setup_super (sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; - enable_quota = 1; - } - } -#ifdef CONFIG_QUOTA - /* Release old quota file names */ - for (i = 0; i < EXT3_MAXQUOTAS; i++) - kfree(old_opts.s_qf_names[i]); -#endif - if (enable_quota) - dquot_resume(sb, -1); - return 0; -restore_opts: - sb->s_flags = old_sb_flags; - sbi->s_mount_opt = old_opts.s_mount_opt; - sbi->s_resuid = old_opts.s_resuid; - sbi->s_resgid = old_opts.s_resgid; - sbi->s_commit_interval = old_opts.s_commit_interval; -#ifdef CONFIG_QUOTA - sbi->s_jquota_fmt = old_opts.s_jquota_fmt; - for (i = 0; i < EXT3_MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = old_opts.s_qf_names[i]; - } -#endif - return err; -} - -static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) -{ - struct super_block *sb = dentry->d_sb; - struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; - u64 fsid; - - if (test_opt(sb, MINIX_DF)) { - sbi->s_overhead_last = 0; - } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { - unsigned long ngroups = sbi->s_groups_count, i; - ext3_fsblk_t overhead = 0; - smp_rmb(); - - /* - * Compute the overhead (FS structures). This is constant - * for a given filesystem unless the number of block groups - * changes so we cache the previous value until it does. - */ - - /* - * All of the blocks before first_data_block are - * overhead - */ - overhead = le32_to_cpu(es->s_first_data_block); - - /* - * Add the overhead attributed to the superblock and - * block group descriptors. If the sparse superblocks - * feature is turned on, then not all groups have this. - */ - for (i = 0; i < ngroups; i++) { - overhead += ext3_bg_has_super(sb, i) + - ext3_bg_num_gdb(sb, i); - cond_resched(); - } - - /* - * Every block group has an inode bitmap, a block - * bitmap, and an inode table. - */ - overhead += ngroups * (2 + sbi->s_itb_per_group); - - /* Add the internal journal blocks as well */ - if (sbi->s_journal && !sbi->journal_bdev) - overhead += sbi->s_journal->j_maxlen; - - sbi->s_overhead_last = overhead; - smp_wmb(); - sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count); - } - - buf->f_type = EXT3_SUPER_MAGIC; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); - buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); - if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) - buf->f_bavail = 0; - buf->f_files = le32_to_cpu(es->s_inodes_count); - buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - buf->f_namelen = EXT3_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; - return 0; -} - -/* Helper function for writing quotas on sync - we need to start transaction before quota file - * is locked for write. Otherwise the are possible deadlocks: - * Process 1 Process 2 - * ext3_create() quota_sync() - * journal_start() write_dquot() - * dquot_initialize() down(dqio_mutex) - * down(dqio_mutex) journal_start() - * - */ - -#ifdef CONFIG_QUOTA - -static inline struct inode *dquot_to_inode(struct dquot *dquot) -{ - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; -} - -static int ext3_write_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - struct inode *inode; - - inode = dquot_to_inode(dquot); - handle = ext3_journal_start(inode, - EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_commit(dquot); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext3_acquire_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - - handle = ext3_journal_start(dquot_to_inode(dquot), - EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_acquire(dquot); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext3_release_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - - handle = ext3_journal_start(dquot_to_inode(dquot), - EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) { - /* Release dquot anyway to avoid endless cycle in dqput() */ - dquot_release(dquot); - return PTR_ERR(handle); - } - ret = dquot_release(dquot); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext3_mark_dquot_dirty(struct dquot *dquot) -{ - /* Are we journaling quotas? */ - if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { - dquot_mark_dquot_dirty(dquot); - return ext3_write_dquot(dquot); - } else { - return dquot_mark_dquot_dirty(dquot); - } -} - -static int ext3_write_info(struct super_block *sb, int type) -{ - int ret, err; - handle_t *handle; - - /* Data block + inode block */ - handle = ext3_journal_start(d_inode(sb->s_root), 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_commit_info(sb, type); - err = ext3_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -/* - * Turn on quotas during mount time - we need to find - * the quota file and such... - */ -static int ext3_quota_on_mount(struct super_block *sb, int type) -{ - return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], - EXT3_SB(sb)->s_jquota_fmt, type); -} - -/* - * Standard function to be called on quota_on - */ -static int ext3_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) -{ - int err; - - if (!test_opt(sb, QUOTA)) - return -EINVAL; - - /* Quotafile not on the same filesystem? */ - if (path->dentry->d_sb != sb) - return -EXDEV; - /* Journaling quota? */ - if (EXT3_SB(sb)->s_qf_names[type]) { - /* Quotafile not of fs root? */ - if (path->dentry->d_parent != sb->s_root) - ext3_msg(sb, KERN_WARNING, - "warning: Quota file not on filesystem root. " - "Journaled quota will not work."); - } - - /* - * When we journal data on quota file, we have to flush journal to see - * all updates to the file when we bypass pagecache... - */ - if (ext3_should_journal_data(d_inode(path->dentry))) { - /* - * We don't need to lock updates but journal_flush() could - * otherwise be livelocked... - */ - journal_lock_updates(EXT3_SB(sb)->s_journal); - err = journal_flush(EXT3_SB(sb)->s_journal); - journal_unlock_updates(EXT3_SB(sb)->s_journal); - if (err) - return err; - } - - return dquot_quota_on(sb, type, format_id, path); -} - -/* Read data from quotafile - avoid pagecache and such because we cannot afford - * acquiring the locks... As quota files are never truncated and quota code - * itself serializes the operations (and no one else should touch the files) - * we don't have to be afraid of races */ -static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int tocopy; - size_t toread; - struct buffer_head *bh; - loff_t i_size = i_size_read(inode); - - if (off > i_size) - return 0; - if (off+len > i_size) - len = i_size-off; - toread = len; - while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; - bh = ext3_bread(NULL, inode, blk, 0, &err); - if (err) - return err; - if (!bh) /* A hole? */ - memset(data, 0, tocopy); - else - memcpy(data, bh->b_data+offset, tocopy); - brelse(bh); - offset = 0; - toread -= tocopy; - data += tocopy; - blk++; - } - return len; -} - -/* Write to quotafile (we know the transaction is already started and has - * enough credits) */ -static ssize_t ext3_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; - struct buffer_head *bh; - handle_t *handle = journal_current_handle(); - - if (!handle) { - ext3_msg(sb, KERN_WARNING, - "warning: quota write (off=%llu, len=%llu)" - " cancelled because transaction is not started.", - (unsigned long long)off, (unsigned long long)len); - return -EIO; - } - - /* - * Since we account only one data block in transaction credits, - * then it is impossible to cross a block boundary. - */ - if (sb->s_blocksize - offset < len) { - ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" - " cancelled because not block aligned", - (unsigned long long)off, (unsigned long long)len); - return -EIO; - } - bh = ext3_bread(handle, inode, blk, 1, &err); - if (!bh) - goto out; - if (journal_quota) { - err = ext3_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, len); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext3_journal_dirty_metadata(handle, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); - } - brelse(bh); -out: - if (err) - return err; - if (inode->i_size < off + len) { - i_size_write(inode, off + len); - EXT3_I(inode)->i_disksize = inode->i_size; - } - inode->i_version++; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ext3_mark_inode_dirty(handle, inode); - return len; -} - -#endif - -static struct dentry *ext3_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super); -} - -static struct file_system_type ext3_fs_type = { - .owner = THIS_MODULE, - .name = "ext3", - .mount = ext3_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("ext3"); - -static int __init init_ext3_fs(void) -{ - int err = init_ext3_xattr(); - if (err) - return err; - err = init_inodecache(); - if (err) - goto out1; - err = register_filesystem(&ext3_fs_type); - if (err) - goto out; - return 0; -out: - destroy_inodecache(); -out1: - exit_ext3_xattr(); - return err; -} - -static void __exit exit_ext3_fs(void) -{ - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); - exit_ext3_xattr(); -} - -MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -MODULE_LICENSE("GPL"); -module_init(init_ext3_fs) -module_exit(exit_ext3_fs) diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c deleted file mode 100644 index c08c59094ae6..000000000000 --- a/fs/ext3/symlink.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * linux/fs/ext3/symlink.c - * - * Only fast symlinks left here - the rest is done by generic code. AV, 1999 - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/symlink.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext3 symlink handling code - */ - -#include "ext3.h" -#include "xattr.h" - -const struct inode_operations ext3_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif -}; - -const struct inode_operations ext3_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = simple_follow_link, - .setattr = ext3_setattr, -#ifdef CONFIG_EXT3_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext3_listxattr, - .removexattr = generic_removexattr, -#endif -}; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c deleted file mode 100644 index 7cf36501ccf4..000000000000 --- a/fs/ext3/xattr.c +++ /dev/null @@ -1,1330 +0,0 @@ -/* - * linux/fs/ext3/xattr.c - * - * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * Fix by Harrison Xing <harrison@mountainviewdata.com>. - * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>. - * Extended attributes for symlinks and special files added per - * suggestion of Luka Renko <luka.renko@hermes.si>. - * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, - * Red Hat Inc. - * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz - * and Andreas Gruenbacher <agruen@suse.de>. - */ - -/* - * Extended attributes are stored directly in inodes (on file systems with - * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl - * field contains the block number if an inode uses an additional block. All - * attributes must fit in the inode and one additional block. Blocks that - * contain the identical set of attributes may be shared among several inodes. - * Identical blocks are detected by keeping a cache of blocks that have - * recently been accessed. - * - * The attributes in inodes and on blocks have a different header; the entries - * are stored in the same format: - * - * +------------------+ - * | header | - * | entry 1 | | - * | entry 2 | | growing downwards - * | entry 3 | v - * | four null bytes | - * | . . . | - * | value 1 | ^ - * | value 3 | | growing upwards - * | value 2 | | - * +------------------+ - * - * The header is followed by multiple entry descriptors. In disk blocks, the - * entry descriptors are kept sorted. In inodes, they are unsorted. The - * attribute values are aligned to the end of the block in no specific order. - * - * Locking strategy - * ---------------- - * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem. - * EA blocks are only changed if they are exclusive to an inode, so - * holding xattr_sem also means that nothing but the EA block's reference - * count can change. Multiple writers to the same block are synchronized - * by the buffer lock. - */ - -#include "ext3.h" -#include <linux/mbcache.h> -#include <linux/quotaops.h> -#include "xattr.h" -#include "acl.h" - -#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - -#define IHDR(inode, raw_inode) \ - ((struct ext3_xattr_ibody_header *) \ - ((void *)raw_inode + \ - EXT3_GOOD_OLD_INODE_SIZE + \ - EXT3_I(inode)->i_extra_isize)) -#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1)) - -#ifdef EXT3_XATTR_DEBUG -# define ea_idebug(inode, f...) do { \ - printk(KERN_DEBUG "inode %s:%lu: ", \ - inode->i_sb->s_id, inode->i_ino); \ - printk(f); \ - printk("\n"); \ - } while (0) -# define ea_bdebug(bh, f...) do { \ - char b[BDEVNAME_SIZE]; \ - printk(KERN_DEBUG "block %s:%lu: ", \ - bdevname(bh->b_bdev, b), \ - (unsigned long) bh->b_blocknr); \ - printk(f); \ - printk("\n"); \ - } while (0) -#else -# define ea_idebug(f...) -# define ea_bdebug(f...) -#endif - -static void ext3_xattr_cache_insert(struct buffer_head *); -static struct buffer_head *ext3_xattr_cache_find(struct inode *, - struct ext3_xattr_header *, - struct mb_cache_entry **); -static void ext3_xattr_rehash(struct ext3_xattr_header *, - struct ext3_xattr_entry *); -static int ext3_xattr_list(struct dentry *dentry, char *buffer, - size_t buffer_size); - -static struct mb_cache *ext3_xattr_cache; - -static const struct xattr_handler *ext3_xattr_handler_map[] = { - [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, -#ifdef CONFIG_EXT3_FS_POSIX_ACL - [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, -#endif - [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, -#ifdef CONFIG_EXT3_FS_SECURITY - [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler, -#endif -}; - -const struct xattr_handler *ext3_xattr_handlers[] = { - &ext3_xattr_user_handler, - &ext3_xattr_trusted_handler, -#ifdef CONFIG_EXT3_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif -#ifdef CONFIG_EXT3_FS_SECURITY - &ext3_xattr_security_handler, -#endif - NULL -}; - -static inline const struct xattr_handler * -ext3_xattr_handler(int name_index) -{ - const struct xattr_handler *handler = NULL; - - if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) - handler = ext3_xattr_handler_map[name_index]; - return handler; -} - -/* - * Inode operation listxattr() - * - * d_inode(dentry)->i_mutex: don't care - */ -ssize_t -ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - return ext3_xattr_list(dentry, buffer, size); -} - -static int -ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end) -{ - while (!IS_LAST_ENTRY(entry)) { - struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry); - if ((void *)next >= end) - return -EIO; - entry = next; - } - return 0; -} - -static inline int -ext3_xattr_check_block(struct buffer_head *bh) -{ - int error; - - if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) - return -EIO; - error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); - return error; -} - -static inline int -ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size) -{ - size_t value_size = le32_to_cpu(entry->e_value_size); - - if (entry->e_value_block != 0 || value_size > size || - le16_to_cpu(entry->e_value_offs) + value_size > size) - return -EIO; - return 0; -} - -static int -ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index, - const char *name, size_t size, int sorted) -{ - struct ext3_xattr_entry *entry; - size_t name_len; - int cmp = 1; - - if (name == NULL) - return -EINVAL; - name_len = strlen(name); - entry = *pentry; - for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { - cmp = name_index - entry->e_name_index; - if (!cmp) - cmp = name_len - entry->e_name_len; - if (!cmp) - cmp = memcmp(name, entry->e_name, name_len); - if (cmp <= 0 && (sorted || cmp == 0)) - break; - } - *pentry = entry; - if (!cmp && ext3_xattr_check_entry(entry, size)) - return -EIO; - return cmp ? -ENODATA : 0; -} - -static int -ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - struct buffer_head *bh = NULL; - struct ext3_xattr_entry *entry; - size_t size; - int error; - - ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", - name_index, name, buffer, (long)buffer_size); - - error = -ENODATA; - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; - ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); - if (!bh) - goto cleanup; - ea_bdebug(bh, "b_count=%d, refcount=%d", - atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext3_xattr_check_block(bh)) { -bad_block: ext3_error(inode->i_sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - ext3_xattr_cache_insert(bh); - entry = BFIRST(bh); - error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EIO) - goto bad_block; - if (error) - goto cleanup; - size = le32_to_cpu(entry->e_value_size); - if (buffer) { - error = -ERANGE; - if (size > buffer_size) - goto cleanup; - memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), - size); - } - error = size; - -cleanup: - brelse(bh); - return error; -} - -static int -ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - struct ext3_xattr_ibody_header *header; - struct ext3_xattr_entry *entry; - struct ext3_inode *raw_inode; - struct ext3_iloc iloc; - size_t size; - void *end; - int error; - - if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) - return -ENODATA; - error = ext3_get_inode_loc(inode, &iloc); - if (error) - return error; - raw_inode = ext3_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - entry = IFIRST(header); - end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - error = ext3_xattr_check_names(entry, end); - if (error) - goto cleanup; - error = ext3_xattr_find_entry(&entry, name_index, name, - end - (void *)entry, 0); - if (error) - goto cleanup; - size = le32_to_cpu(entry->e_value_size); - if (buffer) { - error = -ERANGE; - if (size > buffer_size) - goto cleanup; - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); - } - error = size; - -cleanup: - brelse(iloc.bh); - return error; -} - -/* - * ext3_xattr_get() - * - * Copy an extended attribute into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * - * Returns a negative error number on failure, or the number of bytes - * used / required on success. - */ -int -ext3_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - int error; - - down_read(&EXT3_I(inode)->xattr_sem); - error = ext3_xattr_ibody_get(inode, name_index, name, buffer, - buffer_size); - if (error == -ENODATA) - error = ext3_xattr_block_get(inode, name_index, name, buffer, - buffer_size); - up_read(&EXT3_I(inode)->xattr_sem); - return error; -} - -static int -ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, - char *buffer, size_t buffer_size) -{ - size_t rest = buffer_size; - - for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { - const struct xattr_handler *handler = - ext3_xattr_handler(entry->e_name_index); - - if (handler) { - size_t size = handler->list(dentry, buffer, rest, - entry->e_name, - entry->e_name_len, - handler->flags); - if (buffer) { - if (size > rest) - return -ERANGE; - buffer += size; - } - rest -= size; - } - } - return buffer_size - rest; -} - -static int -ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct inode *inode = d_inode(dentry); - struct buffer_head *bh = NULL; - int error; - - ea_idebug(inode, "buffer=%p, buffer_size=%ld", - buffer, (long)buffer_size); - - error = 0; - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; - ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); - error = -EIO; - if (!bh) - goto cleanup; - ea_bdebug(bh, "b_count=%d, refcount=%d", - atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext3_xattr_check_block(bh)) { - ext3_error(inode->i_sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - ext3_xattr_cache_insert(bh); - error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); - -cleanup: - brelse(bh); - - return error; -} - -static int -ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct inode *inode = d_inode(dentry); - struct ext3_xattr_ibody_header *header; - struct ext3_inode *raw_inode; - struct ext3_iloc iloc; - void *end; - int error; - - if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) - return 0; - error = ext3_get_inode_loc(inode, &iloc); - if (error) - return error; - raw_inode = ext3_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - error = ext3_xattr_check_names(IFIRST(header), end); - if (error) - goto cleanup; - error = ext3_xattr_list_entries(dentry, IFIRST(header), - buffer, buffer_size); - -cleanup: - brelse(iloc.bh); - return error; -} - -/* - * ext3_xattr_list() - * - * Copy a list of attribute names into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * - * Returns a negative error number on failure, or the number of bytes - * used / required on success. - */ -static int -ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - int i_error, b_error; - - down_read(&EXT3_I(d_inode(dentry))->xattr_sem); - i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); - if (i_error < 0) { - b_error = 0; - } else { - if (buffer) { - buffer += i_error; - buffer_size -= i_error; - } - b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); - if (b_error < 0) - i_error = 0; - } - up_read(&EXT3_I(d_inode(dentry))->xattr_sem); - return i_error + b_error; -} - -/* - * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is - * not set, set it. - */ -static void ext3_xattr_update_super_block(handle_t *handle, - struct super_block *sb) -{ - if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) - return; - - if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { - EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - } -} - -/* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. - */ -static void -ext3_xattr_release_block(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - struct mb_cache_entry *ce = NULL; - int error = 0; - - ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr); - error = ext3_journal_get_write_access(handle, bh); - if (error) - goto out; - - lock_buffer(bh); - - if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { - ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); - ext3_free_blocks(handle, inode, bh->b_blocknr, 1); - get_bh(bh); - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - } else { - le32_add_cpu(&BHDR(bh)->h_refcount, -1); - error = ext3_journal_dirty_metadata(handle, bh); - if (IS_SYNC(inode)) - handle->h_sync = 1; - dquot_free_block(inode, 1); - ea_bdebug(bh, "refcount now=%d; releasing", - le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); - } - unlock_buffer(bh); -out: - ext3_std_error(inode->i_sb, error); - return; -} - -struct ext3_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext3_xattr_search { - struct ext3_xattr_entry *first; - void *base; - void *end; - struct ext3_xattr_entry *here; - int not_found; -}; - -static int -ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s) -{ - struct ext3_xattr_entry *last; - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); - - /* Compute min_offs and last. */ - last = s->first; - for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { - size_t offs = le16_to_cpu(last->e_value_offs); - if (offs < min_offs) - min_offs = offs; - } - } - free = min_offs - ((void *)last - s->base) - sizeof(__u32); - if (!s->not_found) { - if (!s->here->e_value_block && s->here->e_value_size) { - size_t size = le32_to_cpu(s->here->e_value_size); - free += EXT3_XATTR_SIZE(size); - } - free += EXT3_XATTR_LEN(name_len); - } - if (i->value) { - if (free < EXT3_XATTR_LEN(name_len) + - EXT3_XATTR_SIZE(i->value_len)) - return -ENOSPC; - } - - if (i->value && s->not_found) { - /* Insert the new name. */ - size_t size = EXT3_XATTR_LEN(name_len); - size_t rest = (void *)last - (void *)s->here + sizeof(__u32); - memmove((void *)s->here + size, s->here, rest); - memset(s->here, 0, size); - s->here->e_name_index = i->name_index; - s->here->e_name_len = name_len; - memcpy(s->here->e_name, i->name, name_len); - } else { - if (!s->here->e_value_block && s->here->e_value_size) { - void *first_val = s->base + min_offs; - size_t offs = le16_to_cpu(s->here->e_value_offs); - void *val = s->base + offs; - size_t size = EXT3_XATTR_SIZE( - le32_to_cpu(s->here->e_value_size)); - - if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) { - /* The old and the new value have the same - size. Just replace. */ - s->here->e_value_size = - cpu_to_le32(i->value_len); - memset(val + size - EXT3_XATTR_PAD, 0, - EXT3_XATTR_PAD); /* Clear pad bytes. */ - memcpy(val, i->value, i->value_len); - return 0; - } - - /* Remove the old value. */ - memmove(first_val + size, first_val, val - first_val); - memset(first_val, 0, size); - s->here->e_value_size = 0; - s->here->e_value_offs = 0; - min_offs += size; - - /* Adjust all value offsets. */ - last = s->first; - while (!IS_LAST_ENTRY(last)) { - size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_block && - last->e_value_size && o < offs) - last->e_value_offs = - cpu_to_le16(o + size); - last = EXT3_XATTR_NEXT(last); - } - } - if (!i->value) { - /* Remove the old name. */ - size_t size = EXT3_XATTR_LEN(name_len); - last = ENTRY((void *)last - size); - memmove(s->here, (void *)s->here + size, - (void *)last - (void *)s->here + sizeof(__u32)); - memset(last, 0, size); - } - } - - if (i->value) { - /* Insert the new value. */ - s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value_len) { - size_t size = EXT3_XATTR_SIZE(i->value_len); - void *val = s->base + min_offs - size; - s->here->e_value_offs = cpu_to_le16(min_offs - size); - memset(val + size - EXT3_XATTR_PAD, 0, - EXT3_XATTR_PAD); /* Clear the pad bytes. */ - memcpy(val, i->value, i->value_len); - } - } - return 0; -} - -struct ext3_xattr_block_find { - struct ext3_xattr_search s; - struct buffer_head *bh; -}; - -static int -ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, - struct ext3_xattr_block_find *bs) -{ - struct super_block *sb = inode->i_sb; - int error; - - ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", - i->name_index, i->name, i->value, (long)i->value_len); - - if (EXT3_I(inode)->i_file_acl) { - /* The inode already has an extended attribute block. */ - bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); - error = -EIO; - if (!bs->bh) - goto cleanup; - ea_bdebug(bs->bh, "b_count=%d, refcount=%d", - atomic_read(&(bs->bh->b_count)), - le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext3_xattr_check_block(bs->bh)) { - ext3_error(sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - /* Find the named attribute. */ - bs->s.base = BHDR(bs->bh); - bs->s.first = BFIRST(bs->bh); - bs->s.end = bs->bh->b_data + bs->bh->b_size; - bs->s.here = bs->s.first; - error = ext3_xattr_find_entry(&bs->s.here, i->name_index, - i->name, bs->bh->b_size, 1); - if (error && error != -ENODATA) - goto cleanup; - bs->s.not_found = error; - } - error = 0; - -cleanup: - return error; -} - -static int -ext3_xattr_block_set(handle_t *handle, struct inode *inode, - struct ext3_xattr_info *i, - struct ext3_xattr_block_find *bs) -{ - struct super_block *sb = inode->i_sb; - struct buffer_head *new_bh = NULL; - struct ext3_xattr_search *s = &bs->s; - struct mb_cache_entry *ce = NULL; - int error = 0; - -#define header(x) ((struct ext3_xattr_header *)(x)) - - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; - if (s->base) { - ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev, - bs->bh->b_blocknr); - error = ext3_journal_get_write_access(handle, bs->bh); - if (error) - goto cleanup; - lock_buffer(bs->bh); - - if (header(s->base)->h_refcount == cpu_to_le32(1)) { - if (ce) { - mb_cache_entry_free(ce); - ce = NULL; - } - ea_bdebug(bs->bh, "modifying in-place"); - error = ext3_xattr_set_entry(i, s); - if (!error) { - if (!IS_LAST_ENTRY(s->first)) - ext3_xattr_rehash(header(s->base), - s->here); - ext3_xattr_cache_insert(bs->bh); - } - unlock_buffer(bs->bh); - if (error == -EIO) - goto bad_block; - if (!error) - error = ext3_journal_dirty_metadata(handle, - bs->bh); - if (error) - goto cleanup; - goto inserted; - } else { - int offset = (char *)s->here - bs->bh->b_data; - - unlock_buffer(bs->bh); - journal_release_buffer(handle, bs->bh); - - if (ce) { - mb_cache_entry_release(ce); - ce = NULL; - } - ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_NOFS); - error = -ENOMEM; - if (s->base == NULL) - goto cleanup; - memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); - s->first = ENTRY(header(s->base)+1); - header(s->base)->h_refcount = cpu_to_le32(1); - s->here = ENTRY(s->base + offset); - s->end = s->base + bs->bh->b_size; - } - } else { - /* Allocate a buffer where we construct the new block. */ - s->base = kzalloc(sb->s_blocksize, GFP_NOFS); - /* assert(header == s->base) */ - error = -ENOMEM; - if (s->base == NULL) - goto cleanup; - header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - header(s->base)->h_blocks = cpu_to_le32(1); - header(s->base)->h_refcount = cpu_to_le32(1); - s->first = ENTRY(header(s->base)+1); - s->here = ENTRY(header(s->base)+1); - s->end = s->base + sb->s_blocksize; - } - - error = ext3_xattr_set_entry(i, s); - if (error == -EIO) - goto bad_block; - if (error) - goto cleanup; - if (!IS_LAST_ENTRY(s->first)) - ext3_xattr_rehash(header(s->base), s->here); - -inserted: - if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce); - if (new_bh) { - /* We found an identical block in the cache. */ - if (new_bh == bs->bh) - ea_bdebug(new_bh, "keeping"); - else { - /* The old block is released after updating - the inode. */ - error = dquot_alloc_block(inode, 1); - if (error) - goto cleanup; - error = ext3_journal_get_write_access(handle, - new_bh); - if (error) - goto cleanup_dquot; - lock_buffer(new_bh); - le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); - ea_bdebug(new_bh, "reusing; refcount now=%d", - le32_to_cpu(BHDR(new_bh)->h_refcount)); - unlock_buffer(new_bh); - error = ext3_journal_dirty_metadata(handle, - new_bh); - if (error) - goto cleanup_dquot; - } - mb_cache_entry_release(ce); - ce = NULL; - } else if (bs->bh && s->base == bs->bh->b_data) { - /* We were modifying this block in-place. */ - ea_bdebug(bs->bh, "keeping this block"); - new_bh = bs->bh; - get_bh(new_bh); - } else { - /* We need to allocate a new block */ - ext3_fsblk_t goal = ext3_group_first_block_no(sb, - EXT3_I(inode)->i_block_group); - ext3_fsblk_t block; - - /* - * Protect us agaist concurrent allocations to the - * same inode from ext3_..._writepage(). Reservation - * code does not expect racing allocations. - */ - mutex_lock(&EXT3_I(inode)->truncate_mutex); - block = ext3_new_block(handle, inode, goal, &error); - mutex_unlock(&EXT3_I(inode)->truncate_mutex); - if (error) - goto cleanup; - ea_idebug(inode, "creating block %d", block); - - new_bh = sb_getblk(sb, block); - if (unlikely(!new_bh)) { -getblk_failed: - ext3_free_blocks(handle, inode, block, 1); - error = -ENOMEM; - goto cleanup; - } - lock_buffer(new_bh); - error = ext3_journal_get_create_access(handle, new_bh); - if (error) { - unlock_buffer(new_bh); - goto getblk_failed; - } - memcpy(new_bh->b_data, s->base, new_bh->b_size); - set_buffer_uptodate(new_bh); - unlock_buffer(new_bh); - ext3_xattr_cache_insert(new_bh); - error = ext3_journal_dirty_metadata(handle, new_bh); - if (error) - goto cleanup; - } - } - - /* Update the inode. */ - EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; - - /* Drop the previous xattr block. */ - if (bs->bh && bs->bh != new_bh) - ext3_xattr_release_block(handle, inode, bs->bh); - error = 0; - -cleanup: - if (ce) - mb_cache_entry_release(ce); - brelse(new_bh); - if (!(bs->bh && s->base == bs->bh->b_data)) - kfree(s->base); - - return error; - -cleanup_dquot: - dquot_free_block(inode, 1); - goto cleanup; - -bad_block: - ext3_error(inode->i_sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - goto cleanup; - -#undef header -} - -struct ext3_xattr_ibody_find { - struct ext3_xattr_search s; - struct ext3_iloc iloc; -}; - -static int -ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, - struct ext3_xattr_ibody_find *is) -{ - struct ext3_xattr_ibody_header *header; - struct ext3_inode *raw_inode; - int error; - - if (EXT3_I(inode)->i_extra_isize == 0) - return 0; - raw_inode = ext3_raw_inode(&is->iloc); - header = IHDR(inode, raw_inode); - is->s.base = is->s.first = IFIRST(header); - is->s.here = is->s.first; - is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { - error = ext3_xattr_check_names(IFIRST(header), is->s.end); - if (error) - return error; - /* Find the named attribute. */ - error = ext3_xattr_find_entry(&is->s.here, i->name_index, - i->name, is->s.end - - (void *)is->s.base, 0); - if (error && error != -ENODATA) - return error; - is->s.not_found = error; - } - return 0; -} - -static int -ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext3_xattr_info *i, - struct ext3_xattr_ibody_find *is) -{ - struct ext3_xattr_ibody_header *header; - struct ext3_xattr_search *s = &is->s; - int error; - - if (EXT3_I(inode)->i_extra_isize == 0) - return -ENOSPC; - error = ext3_xattr_set_entry(i, s); - if (error) - return error; - header = IHDR(inode, ext3_raw_inode(&is->iloc)); - if (!IS_LAST_ENTRY(s->first)) { - header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - ext3_set_inode_state(inode, EXT3_STATE_XATTR); - } else { - header->h_magic = cpu_to_le32(0); - ext3_clear_inode_state(inode, EXT3_STATE_XATTR); - } - return 0; -} - -/* - * ext3_xattr_set_handle() - * - * Create, replace or remove an extended attribute for this inode. Value - * is NULL to remove an existing extended attribute, and non-NULL to - * either replace an existing extended attribute, or create a new extended - * attribute. The flags XATTR_REPLACE and XATTR_CREATE - * specify that an extended attribute must exist and must not exist - * previous to the call, respectively. - * - * Returns 0, or a negative error number on failure. - */ -int -ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len, - int flags) -{ - struct ext3_xattr_info i = { - .name_index = name_index, - .name = name, - .value = value, - .value_len = value_len, - - }; - struct ext3_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext3_xattr_block_find bs = { - .s = { .not_found = -ENODATA, }, - }; - int error; - - if (!name) - return -EINVAL; - if (strlen(name) > 255) - return -ERANGE; - down_write(&EXT3_I(inode)->xattr_sem); - error = ext3_get_inode_loc(inode, &is.iloc); - if (error) - goto cleanup; - - error = ext3_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; - - if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { - struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); - memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - ext3_clear_inode_state(inode, EXT3_STATE_NEW); - } - - error = ext3_xattr_ibody_find(inode, &i, &is); - if (error) - goto cleanup; - if (is.s.not_found) - error = ext3_xattr_block_find(inode, &i, &bs); - if (error) - goto cleanup; - if (is.s.not_found && bs.s.not_found) { - error = -ENODATA; - if (flags & XATTR_REPLACE) - goto cleanup; - error = 0; - if (!value) - goto cleanup; - } else { - error = -EEXIST; - if (flags & XATTR_CREATE) - goto cleanup; - } - if (!value) { - if (!is.s.not_found) - error = ext3_xattr_ibody_set(handle, inode, &i, &is); - else if (!bs.s.not_found) - error = ext3_xattr_block_set(handle, inode, &i, &bs); - } else { - error = ext3_xattr_ibody_set(handle, inode, &i, &is); - if (!error && !bs.s.not_found) { - i.value = NULL; - error = ext3_xattr_block_set(handle, inode, &i, &bs); - } else if (error == -ENOSPC) { - if (EXT3_I(inode)->i_file_acl && !bs.s.base) { - error = ext3_xattr_block_find(inode, &i, &bs); - if (error) - goto cleanup; - } - error = ext3_xattr_block_set(handle, inode, &i, &bs); - if (error) - goto cleanup; - if (!is.s.not_found) { - i.value = NULL; - error = ext3_xattr_ibody_set(handle, inode, &i, - &is); - } - } - } - if (!error) { - ext3_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = CURRENT_TIME_SEC; - error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); - /* - * The bh is consumed by ext3_mark_iloc_dirty, even with - * error != 0. - */ - is.iloc.bh = NULL; - if (IS_SYNC(inode)) - handle->h_sync = 1; - } - -cleanup: - brelse(is.iloc.bh); - brelse(bs.bh); - up_write(&EXT3_I(inode)->xattr_sem); - return error; -} - -/* - * ext3_xattr_set() - * - * Like ext3_xattr_set_handle, but start from an inode. This extended - * attribute modification is a filesystem transaction by itself. - * - * Returns 0, or a negative error number on failure. - */ -int -ext3_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len, int flags) -{ - handle_t *handle; - int error, retries = 0; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - } else { - int error2; - - error = ext3_xattr_set_handle(handle, inode, name_index, name, - value, value_len, flags); - error2 = ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - if (error == 0) - error = error2; - } - - return error; -} - -/* - * ext3_xattr_delete_inode() - * - * Free extended attribute resources associated with this inode. This - * is called immediately before an inode is freed. We have exclusive - * access to the inode. - */ -void -ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ - struct buffer_head *bh = NULL; - - if (!EXT3_I(inode)->i_file_acl) - goto cleanup; - bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); - if (!bh) { - ext3_error(inode->i_sb, __func__, - "inode %lu: block "E3FSBLK" read error", inode->i_ino, - EXT3_I(inode)->i_file_acl); - goto cleanup; - } - if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext3_error(inode->i_sb, __func__, - "inode %lu: bad block "E3FSBLK, inode->i_ino, - EXT3_I(inode)->i_file_acl); - goto cleanup; - } - ext3_xattr_release_block(handle, inode, bh); - EXT3_I(inode)->i_file_acl = 0; - -cleanup: - brelse(bh); -} - -/* - * ext3_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext3_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - -/* - * ext3_xattr_cache_insert() - * - * Create a new entry in the extended attribute cache, and insert - * it unless such an entry is already in the cache. - * - * Returns 0, or a negative error number on failure. - */ -static void -ext3_xattr_cache_insert(struct buffer_head *bh) -{ - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; - int error; - - ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS); - if (!ce) { - ea_bdebug(bh, "out of memory"); - return; - } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); - if (error) { - mb_cache_entry_free(ce); - if (error == -EBUSY) { - ea_bdebug(bh, "already in cache"); - error = 0; - } - } else { - ea_bdebug(bh, "inserting [%x]", (int)hash); - mb_cache_entry_release(ce); - } -} - -/* - * ext3_xattr_cmp() - * - * Compare two extended attribute blocks for equality. - * - * Returns 0 if the blocks are equal, 1 if they differ, and - * a negative error number on errors. - */ -static int -ext3_xattr_cmp(struct ext3_xattr_header *header1, - struct ext3_xattr_header *header2) -{ - struct ext3_xattr_entry *entry1, *entry2; - - entry1 = ENTRY(header1+1); - entry2 = ENTRY(header2+1); - while (!IS_LAST_ENTRY(entry1)) { - if (IS_LAST_ENTRY(entry2)) - return 1; - if (entry1->e_hash != entry2->e_hash || - entry1->e_name_index != entry2->e_name_index || - entry1->e_name_len != entry2->e_name_len || - entry1->e_value_size != entry2->e_value_size || - memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) - return 1; - if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EIO; - if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), - (char *)header2 + le16_to_cpu(entry2->e_value_offs), - le32_to_cpu(entry1->e_value_size))) - return 1; - - entry1 = EXT3_XATTR_NEXT(entry1); - entry2 = EXT3_XATTR_NEXT(entry2); - } - if (!IS_LAST_ENTRY(entry2)) - return 1; - return 0; -} - -/* - * ext3_xattr_cache_find() - * - * Find an identical extended attribute block. - * - * Returns a pointer to the block found, or NULL if such a block was - * not found or an error occurred. - */ -static struct buffer_head * -ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header, - struct mb_cache_entry **pce) -{ - __u32 hash = le32_to_cpu(header->h_hash); - struct mb_cache_entry *ce; - - if (!header->h_hash) - return NULL; /* never share */ - ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: - ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev, - hash); - while (ce) { - struct buffer_head *bh; - - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } - bh = sb_bread(inode->i_sb, ce->e_block); - if (!bh) { - ext3_error(inode->i_sb, __func__, - "inode %lu: block %lu read error", - inode->i_ino, (unsigned long) ce->e_block); - } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= - EXT3_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %lu refcount %d>=%d", - (unsigned long) ce->e_block, - le32_to_cpu(BHDR(bh)->h_refcount), - EXT3_XATTR_REFCOUNT_MAX); - } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) { - *pce = ce; - return bh; - } - brelse(bh); - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); - } - return NULL; -} - -#define NAME_HASH_SHIFT 5 -#define VALUE_HASH_SHIFT 16 - -/* - * ext3_xattr_hash_entry() - * - * Compute the hash of an extended attribute. - */ -static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, - struct ext3_xattr_entry *entry) -{ - __u32 hash = 0; - char *name = entry->e_name; - int n; - - for (n=0; n < entry->e_name_len; n++) { - hash = (hash << NAME_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ - *name++; - } - - if (entry->e_value_block == 0 && entry->e_value_size != 0) { - __le32 *value = (__le32 *)((char *)header + - le16_to_cpu(entry->e_value_offs)); - for (n = (le32_to_cpu(entry->e_value_size) + - EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { - hash = (hash << VALUE_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ - le32_to_cpu(*value++); - } - } - entry->e_hash = cpu_to_le32(hash); -} - -#undef NAME_HASH_SHIFT -#undef VALUE_HASH_SHIFT - -#define BLOCK_HASH_SHIFT 16 - -/* - * ext3_xattr_rehash() - * - * Re-compute the extended attribute hash value after an entry has changed. - */ -static void ext3_xattr_rehash(struct ext3_xattr_header *header, - struct ext3_xattr_entry *entry) -{ - struct ext3_xattr_entry *here; - __u32 hash = 0; - - ext3_xattr_hash_entry(header, entry); - here = ENTRY(header+1); - while (!IS_LAST_ENTRY(here)) { - if (!here->e_hash) { - /* Block is not shared if an entry's hash value == 0 */ - hash = 0; - break; - } - hash = (hash << BLOCK_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ - le32_to_cpu(here->e_hash); - here = EXT3_XATTR_NEXT(here); - } - header->h_hash = cpu_to_le32(hash); -} - -#undef BLOCK_HASH_SHIFT - -int __init -init_ext3_xattr(void) -{ - ext3_xattr_cache = mb_cache_create("ext3_xattr", 6); - if (!ext3_xattr_cache) - return -ENOMEM; - return 0; -} - -void -exit_ext3_xattr(void) -{ - if (ext3_xattr_cache) - mb_cache_destroy(ext3_xattr_cache); - ext3_xattr_cache = NULL; -} diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h deleted file mode 100644 index 32e93ebf8031..000000000000 --- a/fs/ext3/xattr.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - File: fs/ext3/xattr.h - - On-disk format of extended attributes for the ext3 filesystem. - - (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> -*/ - -#include <linux/xattr.h> - -/* Magic value in attribute blocks */ -#define EXT3_XATTR_MAGIC 0xEA020000 - -/* Maximum number of references to one attribute block */ -#define EXT3_XATTR_REFCOUNT_MAX 1024 - -/* Name indexes */ -#define EXT3_XATTR_INDEX_USER 1 -#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 -#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -#define EXT3_XATTR_INDEX_TRUSTED 4 -#define EXT3_XATTR_INDEX_LUSTRE 5 -#define EXT3_XATTR_INDEX_SECURITY 6 - -struct ext3_xattr_header { - __le32 h_magic; /* magic number for identification */ - __le32 h_refcount; /* reference count */ - __le32 h_blocks; /* number of disk blocks used */ - __le32 h_hash; /* hash value of all attributes */ - __u32 h_reserved[4]; /* zero right now */ -}; - -struct ext3_xattr_ibody_header { - __le32 h_magic; /* magic number for identification */ -}; - -struct ext3_xattr_entry { - __u8 e_name_len; /* length of name */ - __u8 e_name_index; /* attribute name index */ - __le16 e_value_offs; /* offset in disk block of value */ - __le32 e_value_block; /* disk block attribute is stored on (n/i) */ - __le32 e_value_size; /* size of attribute value */ - __le32 e_hash; /* hash value of name and value */ - char e_name[0]; /* attribute name */ -}; - -#define EXT3_XATTR_PAD_BITS 2 -#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS) -#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1) -#define EXT3_XATTR_LEN(name_len) \ - (((name_len) + EXT3_XATTR_ROUND + \ - sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND) -#define EXT3_XATTR_NEXT(entry) \ - ( (struct ext3_xattr_entry *)( \ - (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) ) -#define EXT3_XATTR_SIZE(size) \ - (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) - -# ifdef CONFIG_EXT3_FS_XATTR - -extern const struct xattr_handler ext3_xattr_user_handler; -extern const struct xattr_handler ext3_xattr_trusted_handler; -extern const struct xattr_handler ext3_xattr_security_handler; - -extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); - -extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); -extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); - -extern void ext3_xattr_delete_inode(handle_t *, struct inode *); -extern void ext3_xattr_put_super(struct super_block *); - -extern int init_ext3_xattr(void); -extern void exit_ext3_xattr(void); - -extern const struct xattr_handler *ext3_xattr_handlers[]; - -# else /* CONFIG_EXT3_FS_XATTR */ - -static inline int -ext3_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext3_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline void -ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ -} - -static inline void -ext3_xattr_put_super(struct super_block *sb) -{ -} - -static inline int -init_ext3_xattr(void) -{ - return 0; -} - -static inline void -exit_ext3_xattr(void) -{ -} - -#define ext3_xattr_handlers NULL - -# endif /* CONFIG_EXT3_FS_XATTR */ - -#ifdef CONFIG_EXT3_FS_SECURITY -extern int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir, const struct qstr *qstr); -#else -static inline int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir, const struct qstr *qstr) -{ - return 0; -} -#endif diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c deleted file mode 100644 index c9506d5e3b13..000000000000 --- a/fs/ext3/xattr_security.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * linux/fs/ext3/xattr_security.c - * Handler for storing security labels as extended attributes. - */ - -#include <linux/security.h> -#include "ext3.h" -#include "xattr.h" - -static size_t -ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext3_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, - name, buffer, size); -} - -static int -ext3_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, - name, value, size, flags); -} - -static int ext3_initxattrs(struct inode *inode, - const struct xattr *xattr_array, - void *fs_info) -{ - const struct xattr *xattr; - handle_t *handle = fs_info; - int err = 0; - - for (xattr = xattr_array; xattr->name != NULL; xattr++) { - err = ext3_xattr_set_handle(handle, inode, - EXT3_XATTR_INDEX_SECURITY, - xattr->name, xattr->value, - xattr->value_len, 0); - if (err < 0) - break; - } - return err; -} - -int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) -{ - return security_inode_init_security(inode, dir, qstr, - &ext3_initxattrs, handle); -} - -const struct xattr_handler ext3_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = ext3_xattr_security_list, - .get = ext3_xattr_security_get, - .set = ext3_xattr_security_set, -}; diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c deleted file mode 100644 index 206cc66dc285..000000000000 --- a/fs/ext3/xattr_trusted.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * linux/fs/ext3/xattr_trusted.c - * Handler for trusted extended attributes. - * - * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include "ext3.h" -#include "xattr.h" - -static size_t -ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext3_xattr_trusted_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, - name, buffer, size); -} - -static int -ext3_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, - value, size, flags); -} - -const struct xattr_handler ext3_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .list = ext3_xattr_trusted_list, - .get = ext3_xattr_trusted_get, - .set = ext3_xattr_trusted_set, -}; diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c deleted file mode 100644 index 021508ad1616..000000000000 --- a/fs/ext3/xattr_user.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * linux/fs/ext3/xattr_user.c - * Handler for extended user attributes. - * - * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include "ext3.h" -#include "xattr.h" - -static size_t -ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, XATTR_USER)) - return -EOPNOTSUPP; - return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER, - name, buffer, size); -} - -static int -ext3_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, XATTR_USER)) - return -EOPNOTSUPP; - return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER, - name, value, size, flags); -} - -const struct xattr_handler ext3_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .list = ext3_xattr_user_list, - .get = ext3_xattr_user_get, - .set = ext3_xattr_user_set, -}; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index bf8bc8aba471..47728da7702c 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -1,5 +1,38 @@ +# Ext3 configs are here for backward compatibility with old configs which may +# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable +# kernels after the removal of ext3 driver. +config EXT3_FS + tristate "The Extended 3 (ext3) filesystem" + # These must match EXT4_FS selects... + select EXT4_FS + select JBD2 + select CRC16 + select CRYPTO + select CRYPTO_CRC32C + help + This config option is here only for backward compatibility. ext3 + filesystem is now handled by the ext4 driver. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS + select EXT4_FS_POSIX_ACL + select FS_POSIX_ACL + help + This config option is here only for backward compatibility. ext3 + filesystem is now handled by the ext4 driver. + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS + select EXT4_FS_SECURITY + help + This config option is here only for backward compatibility. ext3 + filesystem is now handled by the ext4 driver. + config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + # Please update EXT3_FS selects when changing these select JBD2 select CRC16 select CRYPTO @@ -16,26 +49,27 @@ config EXT4_FS up fsck time. For more information, please see the web pages at http://ext4.wiki.kernel.org. - The ext4 filesystem will support mounting an ext3 - filesystem; while there will be some performance gains from - the delayed allocation and inode table readahead, the best - performance gains will require enabling ext4 features in the - filesystem, or formatting a new filesystem as an ext4 - filesystem initially. + The ext4 filesystem supports mounting an ext3 filesystem; while there + are some performance gains from the delayed allocation and inode + table readahead, the best performance gains require enabling ext4 + features in the filesystem using tune2fs, or formatting a new + filesystem as an ext4 filesystem initially. Without explicit enabling + of ext4 features, the on disk filesystem format stays fully backward + compatible. To compile this file system support as a module, choose M here. The module will be called ext4. If unsure, say N. -config EXT4_USE_FOR_EXT23 +config EXT4_USE_FOR_EXT2 bool "Use ext4 for ext2/ext3 file systems" depends on EXT4_FS - depends on EXT3_FS=n || EXT2_FS=n + depends on EXT2_FS=n default y help - Allow the ext4 file system driver code to be used for ext2 or - ext3 file system mounts. This allows users to reduce their + Allow the ext4 file system driver code to be used for ext2 + file system mounts. This allows users to reduce their compiled kernel size by using one file system driver for ext2, ext3, and ext4 file systems. diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 7dc4eb55913c..847f919c84d9 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -19,7 +19,6 @@ #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/key.h> -#include <linux/key.h> #include <linux/list.h> #include <linux/mempool.h> #include <linux/random.h> @@ -329,6 +328,10 @@ int _ext4_fname_disk_to_usr(struct inode *inode, return oname->len; } } + if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) { + EXT4_ERROR_INODE(inode, "encrypted inode too small"); + return -EUCLEAN; + } if (EXT4_I(inode)->i_crypt_info) return ext4_fname_decrypt(inode, iname, oname); diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 442d24e8efc0..1d510c11b100 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -30,7 +30,7 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) /** * ext4_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivatio. + * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. * @derived_key: Derived key. * @@ -220,6 +220,8 @@ retry: BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + if (res) + goto out; got_key: ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index 02c4e5df7afb..a640ec2c4b13 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/types.h> +#include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -49,7 +50,8 @@ static int ext4_create_encryption_context_from_policy( struct inode *inode, const struct ext4_encryption_policy *policy) { struct ext4_encryption_context ctx; - int res = 0; + handle_t *handle; + int res, res2; res = ext4_convert_inline_data(inode); if (res) @@ -78,11 +80,22 @@ static int ext4_create_encryption_context_from_policy( BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + handle = ext4_journal_start(inode, EXT4_HT_MISC, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); - if (!res) + if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + if (!res) + res = res2; return res; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f5e9f04220c1..fd1f28be5296 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -187,7 +187,7 @@ typedef struct ext4_io_end { } ext4_io_end_t; struct ext4_io_submit { - int io_op; + struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; @@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_get_block_dax(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index aadb72828834..2553aa8b608d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -504,7 +504,7 @@ __read_extent_tree_block(const char *function, unsigned int line, struct buffer_head *bh; int err; - bh = sb_getblk(inode->i_sb, pblk); + bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -1089,7 +1089,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = -EIO; goto cleanup; } - bh = sb_getblk(inode->i_sb, newblock); + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; @@ -1283,7 +1283,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, if (newblock == 0) return err; - bh = sb_getblk(inode->i_sb, newblock); + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac517f15741c..113837e7ba98 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> @@ -192,19 +193,88 @@ out: } #ifdef CONFIG_FS_DAX +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16TB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext4_get_block); - /* Is this the right get_block? */ + int result; + handle_t *handle = NULL; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + } + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; + else + result = __dax_fault(vma, vmf, ext4_get_block_dax, + ext4_end_io_unwritten); + + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + sb_end_pagefault(sb); + } + + return result; +} + +static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + int result; + handle_t *handle = NULL; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + bool write = flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + ext4_chunk_trans_blocks(inode, + PMD_SIZE / PAGE_SIZE)); + } + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; + else + result = __dax_pmd_fault(vma, addr, pmd, flags, + ext4_get_block_dax, ext4_end_io_unwritten); + + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + sb_end_pagefault(sb); + } + + return result; } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block); + return dax_mkwrite(vma, vmf, ext4_get_block_dax, + ext4_end_io_unwritten); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, + .pmd_fault = ext4_dax_pmd_fault, .page_mkwrite = ext4_dax_mkwrite, .pfn_mkwrite = dax_pfn_mkwrite, }; @@ -232,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 173c1ae21395..619bfc1fda8c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -721,7 +721,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, struct ext4_group_desc *gdp = NULL; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; - int ret2, err = 0; + int ret2, err; struct inode *ret; ext4_group_t i; ext4_group_t flex_group; @@ -769,7 +769,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, inode->i_gid = dir->i_gid; } else inode_init_owner(inode, dir, mode); - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + goto out; if (!goal) goal = sbi->s_inode_goal; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 4f6ac499f09e..2468261748b2 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -22,6 +22,7 @@ #include "ext4_jbd2.h" #include "truncate.h" +#include <linux/dax.h> #include <linux/uio.h> #include <trace/events/ext4.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f8a8d4ee7459..612fbcf76b5c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -22,6 +22,7 @@ #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> @@ -656,18 +657,6 @@ has_zeroout: return retval; } -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -705,10 +694,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + if (IS_DAX(inode) && buffer_unwritten(bh)) { + /* + * dgc: I suspect unwritten conversion on ext4+DAX is + * fundamentally broken here when there are concurrent + * read/write in progress on this inode. + */ + WARN_ON_ONCE(io_end); bh->b_assoc_map = inode->i_mapping; bh->b_private = (void *)(unsigned long)iblock; - bh->b_end_io = ext4_end_io_unwritten; } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); @@ -1330,7 +1324,7 @@ static void ext4_da_page_release_reservation(struct page *page, unsigned int offset, unsigned int length) { - int to_release = 0; + int to_release = 0, contiguous_blks = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; @@ -1351,14 +1345,23 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; + contiguous_blks++; clear_buffer_delay(bh); + } else if (contiguous_blks) { + lblk = page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblk += (curr_off >> inode->i_blkbits) - + contiguous_blks; + ext4_es_remove_extent(inode, lblk, contiguous_blks); + contiguous_blks = 0; } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - if (to_release) { + if (contiguous_blks) { lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, lblk, to_release); + lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; + ext4_es_remove_extent(inode, lblk, contiguous_blks); } /* If we have released all the blocks belonging to a cluster, then we @@ -3018,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_NO_LOCK); } +int ext4_get_block_dax(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; + if (create) + flags |= EXT4_GET_BLOCKS_CREATE; + ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, flags); +} + static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { @@ -4351,7 +4365,12 @@ static void ext4_update_other_inodes_time(struct super_block *sb, int inode_size = EXT4_INODE_SIZE(sb); oi.orig_ino = orig_ino; - ino = (orig_ino & ~(inodes_per_block - 1)) + 1; + /* + * Calculate the first inode in the inode table block. Inode + * numbers are one-based. That is, the first inode in a block + * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). + */ + ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; @@ -4654,8 +4673,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); + if (is_quota_modification(inode, attr)) { + error = dquot_initialize(inode); + if (error) + return error; + } if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; @@ -4718,6 +4740,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = ext4_orphan_add(handle, inode); orphan = 1; } + /* + * Update c/mtime on truncate up, ext4_truncate() will + * update c/mtime in shrink case below + */ + if (!shrink) { + inode->i_mtime = ext4_current_time(inode); + inode->i_ctime = inode->i_mtime; + } down_write(&EXT4_I(inode)->i_data_sem); EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index cb8451246b30..1346cfa355d0 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -755,7 +755,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } case EXT4_IOC_MOVE_EXT: - case FITRIM: case EXT4_IOC_RESIZE_FS: case EXT4_IOC_PRECACHE_EXTENTS: case EXT4_IOC_SET_ENCRYPTION_POLICY: diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f6aedf88da43..34b610ea5030 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4816,18 +4816,12 @@ do_more: /* * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed + * + * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed + * to fail. */ - retry: - new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); - if (!new_entry) { - /* - * We use a retry loop because - * ext4_free_blocks() is not allowed to fail. - */ - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry; - } + new_entry = kmem_cache_alloc(ext4_free_data_cachep, + GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b52374e42102..6163ad21cb0e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -620,6 +620,7 @@ int ext4_ind_migrate(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent *ex; unsigned int i, len; + ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; int ret; @@ -633,6 +634,14 @@ int ext4_ind_migrate(struct inode *inode) EXT4_FEATURE_RO_COMPAT_BIGALLOC)) return -EOPNOTSUPP; + /* + * In order to get correct extent info, force all delayed allocation + * blocks to be allocated, otherwise delayed allocation blocks may not + * be reflected and bypass the checks on extent header. + */ + if (test_opt(inode->i_sb, DELALLOC)) + ext4_alloc_da_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -650,11 +659,13 @@ int ext4_ind_migrate(struct inode *inode) goto errout; } if (eh->eh_entries == 0) - blk = len = 0; + blk = len = start = end = 0; else { len = le16_to_cpu(ex->ee_len); blk = ext4_ext_pblock(ex); - if (len > EXT4_NDIR_BLOCKS) { + start = le32_to_cpu(ex->ee_block); + end = start + len - 1; + if (end >= EXT4_NDIR_BLOCKS) { ret = -EOPNOTSUPP; goto errout; } @@ -662,7 +673,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memset(ei->i_data, 0, sizeof(ei->i_data)); - for (i=0; i < len; i++) + for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); ext4_mark_inode_dirty(handle, inode); errout: diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 8313ca3324ec..6eb1a619890c 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -69,6 +69,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, ext4_fsblk_t mmp_block) { struct mmp_struct *mmp; + int ret; if (*bh) clear_buffer_uptodate(*bh); @@ -76,33 +77,36 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, /* This would be sb_bread(sb, mmp_block), except we need to be sure * that the MD RAID device cache has been bypassed, and that the read * is not blocked in the elevator. */ - if (!*bh) + if (!*bh) { *bh = sb_getblk(sb, mmp_block); - if (!*bh) - return -ENOMEM; - if (*bh) { - get_bh(*bh); - lock_buffer(*bh); - (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; + if (!*bh) { + ret = -ENOMEM; + goto warn_exit; } } - if (unlikely(!*bh)) { - ext4_warning(sb, "Error while reading MMP block %llu", - mmp_block); - return -EIO; + + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + ret = -EIO; + goto warn_exit; } mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC || - !ext4_mmp_csum_verify(sb, mmp)) - return -EINVAL; - - return 0; + if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC && + ext4_mmp_csum_verify(sb, mmp)) + return 0; + ret = -EINVAL; + +warn_exit: + ext4_warning(sb, "Error %d while reading MMP block %llu", + ret, mmp_block); + return ret; } /* @@ -111,7 +115,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, const char *function, unsigned int line, const char *msg) { - __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, "MMP failure info: last update time: %llu, last update " "node: %s, last update device: %s\n", diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 011dcfb5cce3..9f61e7679a6d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2436,7 +2436,9 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct inode *inode; int err, credits, retries = 0; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -2470,7 +2472,9 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -2499,7 +2503,9 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err, retries = 0; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; retry: inode = ext4_new_inode_start_handle(dir, mode, @@ -2612,7 +2618,9 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -2910,8 +2918,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - dquot_initialize(dir); - dquot_initialize(d_inode(dentry)); + retval = dquot_initialize(dir); + if (retval) + return retval; + retval = dquot_initialize(d_inode(dentry)); + if (retval) + return retval; retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -2980,8 +2992,12 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ - dquot_initialize(dir); - dquot_initialize(d_inode(dentry)); + retval = dquot_initialize(dir); + if (retval) + return retval; + retval = dquot_initialize(d_inode(dentry)); + if (retval) + return retval; retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -3066,7 +3082,9 @@ static int ext4_symlink(struct inode *dir, goto err_free_sd; } - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + goto err_free_sd; if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* @@ -3197,7 +3215,9 @@ static int ext4_link(struct dentry *old_dentry, if (ext4_encrypted_inode(dir) && !ext4_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) + return err; retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, @@ -3476,13 +3496,20 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; - dquot_initialize(old.dir); - dquot_initialize(new.dir); + retval = dquot_initialize(old.dir); + if (retval) + return retval; + retval = dquot_initialize(new.dir); + if (retval) + return retval; /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new.inode) - dquot_initialize(new.inode); + if (new.inode) { + retval = dquot_initialize(new.inode); + if (retval) + return retval; + } old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); if (IS_ERR(old.bh)) @@ -3678,8 +3705,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.inode))) return -EPERM; - dquot_initialize(old.dir); - dquot_initialize(new.dir); + retval = dquot_initialize(old.dir); + if (retval) + return retval; + retval = dquot_initialize(new.dir); + if (retval) + return retval; old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5602450f03f6..84ba4d2b3a35 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -61,7 +61,6 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_finish_bio(struct bio *bio) { int i; - int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec; bio_for_each_segment_all(bvec, bio, i) { @@ -88,7 +87,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (error) { + if (bio->bi_error) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); } @@ -107,7 +106,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (error) + if (bio->bi_error) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); @@ -310,27 +309,25 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) } /* BIO completion function for page writeback */ -static void ext4_end_bio(struct bio *bio, int error) +static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; BUG_ON(!io_end); bio->bi_end_io = NULL; - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = 0; - if (error) { + if (bio->bi_error) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - error, inode->i_ino, + bio->bi_error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, error); + mapping_set_error(inode->i_mapping, bio->bi_error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -357,8 +354,10 @@ void ext4_io_submit(struct ext4_io_submit *io) struct bio *bio = io->io_bio; if (bio) { + int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE; bio_get(io->io_bio); - submit_bio(io->io_op, io->io_bio); + submit_bio(io_op, io->io_bio); bio_put(io->io_bio); } io->io_bio = NULL; @@ -367,7 +366,7 @@ void ext4_io_submit(struct ext4_io_submit *io) void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc) { - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_wbc = wbc; io->io_bio = NULL; io->io_end = NULL; } @@ -375,12 +374,12 @@ void ext4_io_submit_init(struct ext4_io_submit *io, static int io_submit_init_bio(struct ext4_io_submit *io, struct buffer_head *bh) { - int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; - bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; + wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; @@ -409,6 +408,7 @@ submit_and_retry: ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; + wbc_account_io(io->io_wbc, page, bh->b_size); io->io_next_block++; return 0; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index ec3ef93a52db..e26803fb210d 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -98,7 +98,7 @@ static inline bool ext4_bio_encrypted(struct bio *bio) * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; int i; @@ -106,7 +106,7 @@ static void mpage_end_io(struct bio *bio, int err) if (ext4_bio_encrypted(bio)) { struct ext4_crypto_ctx *ctx = bio->bi_private; - if (err) { + if (bio->bi_error) { ext4_release_crypto_ctx(ctx); } else { INIT_WORK(&ctx->r.work, completion_pages); @@ -118,7 +118,7 @@ static void mpage_end_io(struct bio *bio, int err) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!err) { + if (!bio->bi_error) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -284,7 +284,7 @@ int ext4_mpage_readpages(struct address_space *mapping, goto set_error_page; } bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + min_t(int, nr_pages, BIO_MAX_PAGES)); if (!bio) { if (ctx) ext4_release_crypto_ctx(ctx); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5c787647afe2..a63c7b0a10cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -60,6 +60,7 @@ static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; static struct ext4_features *ext4_feat; static int ext4_mballoc_ready; +static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -84,7 +85,7 @@ static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); -#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", @@ -100,7 +101,6 @@ MODULE_ALIAS("ext2"); #endif -#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", @@ -111,9 +111,6 @@ static struct file_system_type ext3_fs_type = { MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) -#else -#define IS_EXT3_SB(sb) (0) -#endif static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) @@ -325,6 +322,22 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers. + */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); + + return bdi->dev == NULL; +} + static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; @@ -452,7 +465,7 @@ void __ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { - path = d_path(&(file->f_path), pathname, sizeof(pathname)); + path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); @@ -1394,9 +1407,9 @@ static const struct mount_opts { {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, - {Opt_journal_dev, 0, MOPT_GTE0}, - {Opt_journal_path, 0, MOPT_STRING}, - {Opt_journal_ioprio, 0, MOPT_GTE0}, + {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING}, + {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0}, {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, @@ -1763,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, } if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); #endif } @@ -3643,6 +3656,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); + } else { + sb->s_iflags |= SB_I_CGROUPWB; } sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -4275,9 +4290,10 @@ no_journal: "the device does not support discard"); } - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, - *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -4617,7 +4633,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh) + if (!sbh || block_device_ejected(sb)) return error; if (buffer_write_io_error(sbh)) { /* @@ -4665,7 +4681,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) ext4_superblock_csum_set(sb); mark_buffer_dirty(sbh); if (sync) { - error = sync_dirty_buffer(sbh); + error = __sync_dirty_buffer(sbh, + test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); if (error) return error; @@ -4833,10 +4850,11 @@ static int ext4_freeze(struct super_block *sb) error = jbd2_journal_flush(journal); if (error < 0) goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); } - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: if (journal) @@ -4854,8 +4872,11 @@ static int ext4_unfreeze(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - /* Reset the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + if (EXT4_SB(sb)->s_journal) { + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + } + ext4_commit_super(sb, 1); return 0; } @@ -5500,7 +5521,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); } -#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); @@ -5530,7 +5551,6 @@ static inline void unregister_as_ext2(void) { } static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif -#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static inline void register_as_ext3(void) { int err = register_filesystem(&ext3_fs_type); @@ -5556,11 +5576,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 0; return 1; } -#else -static inline void register_as_ext3(void) { } -static inline void unregister_as_ext3(void) { } -static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } -#endif static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, @@ -5610,6 +5625,7 @@ static int __init ext4_init_fs(void) { int i, err; + ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; mutex_init(&ext4_li_mtx); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index c629762005bc..b0a9dc929f88 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -45,7 +45,7 @@ config F2FS_FS_POSIX_ACL default y help Posix Access Control Lists (ACLs) support permissions for users and - gourps beyond the owner/group/world scheme. + groups beyond the owner/group/world scheme. To learn more about Access Control Lists, visit the POSIX ACLs for Linux website <http://acl.bestbits.at/>. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 396be1a39e55..08e101ed914c 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-y += shrinker.o extent_cache.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b70bbe1a6a8c..c5a38e352a80 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -69,14 +69,24 @@ repeat: fio.page = page; - if (f2fs_submit_page_bio(&fio)) + if (f2fs_submit_page_bio(&fio)) { + f2fs_put_page(page, 1); goto repeat; + } lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + + /* + * if there is any IO error when accessing device, make our filesystem + * readonly and make sure do not write checkpoint with non-uptodate + * meta page. + */ + if (unlikely(!PageUptodate(page))) + f2fs_stop_checkpoint(sbi); out: return page; } @@ -326,26 +336,18 @@ const struct address_space_operations f2fs_meta_aops = { static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { struct inode_management *im = &sbi->im[type]; - struct ino_entry *e; + struct ino_entry *e, *tmp; + + tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); retry: - if (radix_tree_preload(GFP_NOFS)) { - cond_resched(); - goto retry; - } + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); - e = radix_tree_lookup(&im->ino_root, ino); if (!e) { - e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); - if (!e) { - spin_unlock(&im->ino_lock); - radix_tree_preload_end(); - goto retry; - } + e = tmp; if (radix_tree_insert(&im->ino_root, ino, e)) { spin_unlock(&im->ino_lock); - kmem_cache_free(ino_entry_slab, e); radix_tree_preload_end(); goto retry; } @@ -358,6 +360,9 @@ retry: } spin_unlock(&im->ino_lock); radix_tree_preload_end(); + + if (e != tmp) + kmem_cache_free(ino_entry_slab, tmp); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -458,24 +463,34 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) __remove_ino_entry(sbi, ino, ORPHAN_INO); } -static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct inode *inode = f2fs_iget(sbi->sb, ino); - f2fs_bug_on(sbi, IS_ERR(inode)); + struct inode *inode; + + inode = f2fs_iget(sbi->sb, ino); + if (IS_ERR(inode)) { + /* + * there should be a bug that we can't find the entry + * to orphan inode. + */ + f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT); + return PTR_ERR(inode); + } + clear_nlink(inode); /* truncate all the data during iput */ iput(inode); + return 0; } -void recover_orphan_inodes(struct f2fs_sb_info *sbi) +int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; + int err; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) - return; - - set_sbi_flag(sbi, SBI_POR_DOING); + return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -489,14 +504,17 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); - recover_orphan_inode(sbi, ino); + err = recover_orphan_inode(sbi, ino); + if (err) { + f2fs_put_page(page, 1); + return err; + } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); - clear_sbi_flag(sbi, SBI_POR_DOING); - return; + return 0; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) @@ -504,7 +522,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; unsigned int nentries = 0; - unsigned short index; + unsigned short index = 1; unsigned short orphan_blocks; struct page *page = NULL; struct ino_entry *orphan = NULL; @@ -512,11 +530,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); - for (index = 0; index < orphan_blocks; index++) - grab_meta_page(sbi, start_blk + index); - - index = 1; - /* * we don't need to do spin_lock(&im->ino_lock) here, since all the * orphan inode operations are covered under f2fs_lock_op(). @@ -527,12 +540,10 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { - page = find_get_page(META_MAPPING(sbi), start_blk++); - f2fs_bug_on(sbi, !page); + page = grab_meta_page(sbi, start_blk++); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); - f2fs_put_page(page, 0); } orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); @@ -704,7 +715,8 @@ void update_dirty_page(struct inode *inode, struct page *page) struct inode_entry *new; int ret = 0; - if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; if (!S_ISDIR(inode->i_mode)) { @@ -892,12 +904,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); + block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); + bool invalidate = false; /* * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. */ - discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); + if (discard_next_dnode(sbi, discard_blk)) + invalidate = true; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { @@ -1026,6 +1041,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); + /* + * invalidate meta page which is used temporarily for zeroing out + * block at the end of warm node chain. + */ + if (invalidate) + invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, + discard_blk); + release_dirty_inode(sbi); if (unlikely(f2fs_cp_error(sbi))) diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 95b8f936f00b..9f77de2ef317 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -92,8 +92,7 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) if (!ci) return; - if (ci->ci_keyring_key) - key_put(ci->ci_keyring_key); + key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(f2fs_crypt_info_cachep, ci); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9bedfa8dd3a5..a82abe921b89 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -14,6 +14,7 @@ #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> +#include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/prefetch.h> @@ -26,16 +27,13 @@ #include "trace.h" #include <trace/events/f2fs.h> -static struct kmem_cache *extent_tree_slab; -static struct kmem_cache *extent_node_slab; - -static void f2fs_read_end_io(struct bio *bio, int err) +static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; int i; if (f2fs_bio_encrypted(bio)) { - if (err) { + if (bio->bi_error) { f2fs_release_crypto_ctx(bio->bi_private); } else { f2fs_end_io_crypto_work(bio->bi_private, bio); @@ -46,7 +44,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (!err) { + if (!bio->bi_error) { SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -57,7 +55,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) bio_put(bio); } -static void f2fs_write_end_io(struct bio *bio, int err) +static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; @@ -68,7 +66,7 @@ static void f2fs_write_end_io(struct bio *bio, int err) f2fs_restore_and_release_control_page(&page); - if (unlikely(err)) { + if (unlikely(bio->bi_error)) { set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi); @@ -92,8 +90,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, { struct bio *bio; - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); + bio = f2fs_bio_alloc(npages); bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); @@ -158,7 +155,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { bio_put(bio); - f2fs_put_page(page, 1); return -EFAULT; } @@ -266,645 +262,17 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - pgoff_t start_fofs, end_fofs; - block_t start_blkaddr; - - read_lock(&fi->ext_lock); - if (fi->ext.len == 0) { - read_unlock(&fi->ext_lock); - return false; - } - - stat_inc_total_hit(inode->i_sb); - - start_fofs = fi->ext.fofs; - end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk; - - if (pgofs >= start_fofs && pgofs <= end_fofs) { - *ei = fi->ext; - stat_inc_read_hit(inode->i_sb); - read_unlock(&fi->ext_lock); - return true; - } - read_unlock(&fi->ext_lock); - return false; -} - -static bool update_extent_info(struct inode *inode, pgoff_t fofs, - block_t blkaddr) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - pgoff_t start_fofs, end_fofs; - block_t start_blkaddr, end_blkaddr; - int need_update = true; - - write_lock(&fi->ext_lock); - - start_fofs = fi->ext.fofs; - end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk; - end_blkaddr = fi->ext.blk + fi->ext.len - 1; - - /* Drop and initialize the matched extent */ - if (fi->ext.len == 1 && fofs == start_fofs) - fi->ext.len = 0; - - /* Initial extent */ - if (fi->ext.len == 0) { - if (blkaddr != NULL_ADDR) { - fi->ext.fofs = fofs; - fi->ext.blk = blkaddr; - fi->ext.len = 1; - } - goto end_update; - } - - /* Front merge */ - if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) { - fi->ext.fofs--; - fi->ext.blk--; - fi->ext.len++; - goto end_update; - } - - /* Back merge */ - if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) { - fi->ext.len++; - goto end_update; - } - - /* Split the existing extent */ - if (fi->ext.len > 1 && - fofs >= start_fofs && fofs <= end_fofs) { - if ((end_fofs - fofs) < (fi->ext.len >> 1)) { - fi->ext.len = fofs - start_fofs; - } else { - fi->ext.fofs = fofs + 1; - fi->ext.blk = start_blkaddr + fofs - start_fofs + 1; - fi->ext.len -= fofs - start_fofs + 1; - } - } else { - need_update = false; - } - - /* Finally, if the extent is very fragmented, let's drop the cache. */ - if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { - fi->ext.len = 0; - set_inode_flag(fi, FI_NO_EXTENT); - need_update = true; - } -end_update: - write_unlock(&fi->ext_lock); - return need_update; -} - -static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei, - struct rb_node *parent, struct rb_node **p) -{ - struct extent_node *en; - - en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); - if (!en) - return NULL; - - en->ei = *ei; - INIT_LIST_HEAD(&en->list); - - rb_link_node(&en->rb_node, parent, p); - rb_insert_color(&en->rb_node, &et->root); - et->count++; - atomic_inc(&sbi->total_ext_node); - return en; -} - -static void __detach_extent_node(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - rb_erase(&en->rb_node, &et->root); - et->count--; - atomic_dec(&sbi->total_ext_node); - - if (et->cached_en == en) - et->cached_en = NULL; -} - -static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi, - nid_t ino) -{ - struct extent_tree *et; - - down_read(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); - if (!et) { - up_read(&sbi->extent_tree_lock); - return NULL; - } - atomic_inc(&et->refcount); - up_read(&sbi->extent_tree_lock); - - return et; -} - -static struct extent_tree *__grab_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - nid_t ino = inode->i_ino; - - down_write(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); - if (!et) { - et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); - memset(et, 0, sizeof(struct extent_tree)); - et->ino = ino; - et->root = RB_ROOT; - et->cached_en = NULL; - rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; - } - atomic_inc(&et->refcount); - up_write(&sbi->extent_tree_lock); - - return et; -} - -static struct extent_node *__lookup_extent_tree(struct extent_tree *et, - unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en; - - if (et->cached_en) { - struct extent_info *cei = &et->cached_en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - return et->cached_en; - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - et->cached_en = en; - return en; - } - } - return NULL; -} - -static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - struct extent_node *prev; - struct rb_node *node; - - node = rb_prev(&en->rb_node); - if (!node) - return NULL; - - prev = rb_entry(node, struct extent_node, rb_node); - if (__is_back_mergeable(&en->ei, &prev->ei)) { - en->ei.fofs = prev->ei.fofs; - en->ei.blk = prev->ei.blk; - en->ei.len += prev->ei.len; - __detach_extent_node(sbi, et, prev); - return prev; - } - return NULL; -} - -static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_node *en) -{ - struct extent_node *next; - struct rb_node *node; - - node = rb_next(&en->rb_node); - if (!node) - return NULL; - - next = rb_entry(node, struct extent_node, rb_node); - if (__is_front_mergeable(&en->ei, &next->ei)) { - en->ei.len += next->ei.len; - __detach_extent_node(sbi, et, next); - return next; - } - return NULL; -} - -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei, - struct extent_node **den) -{ - struct rb_node **p = &et->root.rb_node; - struct rb_node *parent = NULL; - struct extent_node *en; - - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) { - if (__is_front_mergeable(ei, &en->ei)) { - f2fs_bug_on(sbi, !den); - en->ei.fofs = ei->fofs; - en->ei.blk = ei->blk; - en->ei.len += ei->len; - *den = __try_back_merge(sbi, et, en); - return en; - } - p = &(*p)->rb_left; - } else if (ei->fofs >= en->ei.fofs + en->ei.len) { - if (__is_back_mergeable(ei, &en->ei)) { - f2fs_bug_on(sbi, !den); - en->ei.len += ei->len; - *den = __try_front_merge(sbi, et, en); - return en; - } - p = &(*p)->rb_right; - } else { - f2fs_bug_on(sbi, 1); - } - } - - return __attach_extent_node(sbi, et, ei, parent, p); -} - -static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, bool free_all) +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct rb_node *node, *next; - struct extent_node *en; - unsigned int count = et->count; - - node = rb_first(&et->root); - while (node) { - next = rb_next(node); - en = rb_entry(node, struct extent_node, rb_node); - - if (free_all) { - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); - } - - if (free_all || list_empty(&en->list)) { - __detach_extent_node(sbi, et, en); - kmem_cache_free(extent_node_slab, en); - } - node = next; - } - - return count - et->count; -} - -static void f2fs_init_extent_tree(struct inode *inode, - struct f2fs_extent *i_ext) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - struct extent_node *en; struct extent_info ei; + struct inode *inode = dn->inode; - if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; - - et = __grab_extent_tree(inode); - - write_lock(&et->lock); - if (et->count) - goto out; - - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); - - en = __insert_extent_tree(sbi, et, &ei, NULL); - if (en) { - et->cached_en = en; - - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - } -out: - write_unlock(&et->lock); - atomic_dec(&et->refcount); -} - -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - struct extent_node *en; - - trace_f2fs_lookup_extent_tree_start(inode, pgofs); - - et = __find_extent_tree(sbi, inode->i_ino); - if (!et) - return false; - - read_lock(&et->lock); - en = __lookup_extent_tree(et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_move_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - stat_inc_read_hit(sbi->sb); - } - stat_inc_total_hit(sbi->sb); - read_unlock(&et->lock); - - trace_f2fs_lookup_extent_tree_end(inode, pgofs, en); - - atomic_dec(&et->refcount); - return en ? true : false; -} - -static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, - block_t blkaddr) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; - struct extent_node *den = NULL; - struct extent_info ei, dei; - unsigned int endofs; - - trace_f2fs_update_extent_tree(inode, fofs, blkaddr); - - et = __grab_extent_tree(inode); - - write_lock(&et->lock); - - /* 1. lookup and remove existing extent info in cache */ - en = __lookup_extent_tree(et, fofs); - if (!en) - goto update_extent; - - dei = en->ei; - __detach_extent_node(sbi, et, en); - - /* 2. if extent can be split more, split and insert the left part */ - if (dei.len > 1) { - /* insert left part of split extent into cache */ - if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, dei.fofs, dei.blk, - fofs - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, NULL); - } - - /* insert right part of split extent into cache */ - endofs = dei.fofs + dei.len - 1; - if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, fofs + 1, - fofs - dei.fofs + dei.blk, endofs - fofs); - en2 = __insert_extent_tree(sbi, et, &ei, NULL); - } - } - -update_extent: - /* 3. update extent in extent cache */ - if (blkaddr) { - set_extent_info(&ei, fofs, blkaddr, 1); - en3 = __insert_extent_tree(sbi, et, &ei, &den); - } - - /* 4. update in global extent list */ - spin_lock(&sbi->extent_lock); - if (en && !list_empty(&en->list)) - list_del(&en->list); - /* - * en1 and en2 split from en, they will become more and more smaller - * fragments after splitting several times. So if the length is smaller - * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree. - */ - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - if (en2) - list_add_tail(&en2->list, &sbi->extent_list); - if (en3) { - if (list_empty(&en3->list)) - list_add_tail(&en3->list, &sbi->extent_list); - else - list_move_tail(&en3->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); - - /* 5. release extent node */ - if (en) - kmem_cache_free(extent_node_slab, en); - if (den) - kmem_cache_free(extent_node_slab, den); - - write_unlock(&et->lock); - atomic_dec(&et->refcount); -} - -void f2fs_preserve_extent_tree(struct inode *inode) -{ - struct extent_tree *et; - struct extent_info *ext = &F2FS_I(inode)->ext; - bool sync = false; - - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - return; - - et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino); - if (!et) { - if (ext->len) { - ext->len = 0; - update_inode_page(inode); - } - return; - } - - read_lock(&et->lock); - if (et->count) { - struct extent_node *en; - - if (et->cached_en) { - en = et->cached_en; - } else { - struct rb_node *node = rb_first(&et->root); - - if (!node) - node = rb_last(&et->root); - en = rb_entry(node, struct extent_node, rb_node); - } - - if (__is_extent_same(ext, &en->ei)) - goto out; - - *ext = en->ei; - sync = true; - } else if (ext->len) { - ext->len = 0; - sync = true; - } -out: - read_unlock(&et->lock); - atomic_dec(&et->refcount); - - if (sync) - update_inode_page(inode); -} - -void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) -{ - struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; - struct extent_node *en, *tmp; - unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_iter iter; - void **slot; - unsigned int found; - unsigned int node_cnt = 0, tree_cnt = 0; - - if (!test_opt(sbi, EXTENT_CACHE)) - return; - - if (available_free_memory(sbi, EXTENT_CACHE)) - return; - - spin_lock(&sbi->extent_lock); - list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!nr_shrink--) - break; - list_del_init(&en->list); - } - spin_unlock(&sbi->extent_lock); - - down_read(&sbi->extent_tree_lock); - while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - atomic_inc(&et->refcount); - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); - atomic_dec(&et->refcount); - } - } - up_read(&sbi->extent_tree_lock); - - down_write(&sbi->extent_tree_lock); - radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter, - F2FS_ROOT_INO(sbi)) { - struct extent_tree *et = (struct extent_tree *)*slot; - - if (!atomic_read(&et->refcount) && !et->count) { - radix_tree_delete(&sbi->extent_tree_root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - } - } - up_write(&sbi->extent_tree_lock); - - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); -} - -void f2fs_destroy_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et; - unsigned int node_cnt = 0; - - if (!test_opt(sbi, EXTENT_CACHE)) - return; - - et = __find_extent_tree(sbi, inode->i_ino); - if (!et) - goto out; - - /* free all extent info belong to this extent tree */ - write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - atomic_dec(&et->refcount); - - /* try to find and delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino); - if (!et) { - up_write(&sbi->extent_tree_lock); - goto out; + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn->data_blkaddr = ei.blk + index - ei.fofs; + return 0; } - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - up_write(&sbi->extent_tree_lock); -out: - trace_f2fs_destroy_extent_tree(inode, node_cnt); - return; -} - -void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext) -{ - if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - f2fs_init_extent_tree(inode, i_ext); - - write_lock(&F2FS_I(inode)->ext_lock); - get_extent_info(&F2FS_I(inode)->ext, *i_ext); - write_unlock(&F2FS_I(inode)->ext_lock); -} -static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) -{ - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) - return false; - - if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) - return f2fs_lookup_extent_tree(inode, pgofs, ei); - - return lookup_extent_info(inode, pgofs, ei); -} - -void f2fs_update_extent_cache(struct dnode_of_data *dn) -{ - struct f2fs_inode_info *fi = F2FS_I(dn->inode); - pgoff_t fofs; - - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); - - if (is_inode_flag_set(fi, FI_NO_EXTENT)) - return; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE)) - return f2fs_update_extent_tree(dn->inode, fofs, - dn->data_blkaddr); - - if (update_extent_info(dn->inode, fofs, dn->data_blkaddr)) - sync_inode_page(dn); + return f2fs_reserve_block(dn, index); } struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) @@ -935,15 +303,13 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } + if (err) + goto put_err; f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { - f2fs_put_page(page, 1); - return ERR_PTR(-ENOENT); + err = -ENOENT; + goto put_err; } got_it: if (PageUptodate(page)) { @@ -968,8 +334,12 @@ got_it: fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) - return ERR_PTR(err); + goto put_err; return page; + +put_err: + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *find_data_page(struct inode *inode, pgoff_t index) @@ -1030,7 +400,8 @@ repeat: * * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). - * Note that, ipage is set only by make_empty_dir. + * Note that, ipage is set only by make_empty_dir, and if any error occur, + * ipage should be released by this function. */ struct page *get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) @@ -1041,8 +412,14 @@ struct page *get_new_data_page(struct inode *inode, int err; repeat: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + /* + * before exiting, we should make sure ipage will be released + * if any error occur. + */ + f2fs_put_page(ipage, 1); return ERR_PTR(-ENOMEM); + } set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); @@ -1107,8 +484,6 @@ alloc: allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, &sum, seg); - - /* direct IO doesn't use extent cache to maximize the performance */ set_data_blkaddr(dn); /* update i_size */ @@ -1117,6 +492,9 @@ alloc: if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + /* direct IO doesn't use extent cache to maximize the performance */ + f2fs_drop_largest_extent(dn->inode, fofs); + return 0; } @@ -1183,7 +561,7 @@ out: * c. give the block addresses to blockdev */ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, - int create, bool fiemap) + int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; @@ -1217,8 +595,19 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && !fiemap) - goto put_out; + if (dn.data_blkaddr == NEW_ADDR) { + if (flag == F2FS_GET_BLOCK_BMAP) { + err = -ENOENT; + goto put_out; + } else if (flag == F2FS_GET_BLOCK_READ || + flag == F2FS_GET_BLOCK_DIO) { + goto put_out; + } + /* + * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP), + * mark it as mapped and unwritten block. + */ + } if (dn.data_blkaddr != NULL_ADDR) { map->m_flags = F2FS_MAP_MAPPED; @@ -1233,6 +622,8 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; map->m_pblk = dn.data_blkaddr; } else { + if (flag == F2FS_GET_BLOCK_BMAP) + err = -ENOENT; goto put_out; } @@ -1255,7 +646,9 @@ get_next: err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && !fiemap) + + if (dn.data_blkaddr == NEW_ADDR && + flag != F2FS_GET_BLOCK_FIEMAP) goto put_out; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); @@ -1297,7 +690,7 @@ out: } static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, bool fiemap) + struct buffer_head *bh, int create, int flag) { struct f2fs_map_blocks map; int ret; @@ -1305,7 +698,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - ret = f2fs_map_blocks(inode, &map, create, fiemap); + ret = f2fs_map_blocks(inode, &map, create, flag); if (!ret) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; @@ -1315,15 +708,23 @@ static int __get_data_block(struct inode *inode, sector_t iblock, } static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, int flag) +{ + return __get_data_block(inode, iblock, bh_result, create, flag); +} + +static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __get_data_block(inode, iblock, bh_result, create, false); + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_DIO); } -static int get_data_block_fiemap(struct inode *inode, sector_t iblock, +static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __get_data_block(inode, iblock, bh_result, create, true); + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_BMAP); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -1367,7 +768,8 @@ next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; - ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0); + ret = get_data_block(inode, start_blk, &map_bh, 0, + F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; @@ -1552,7 +954,7 @@ submit_and_realloc: } bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + min_t(int, nr_pages, BIO_MAX_PAGES)); if (!bio) { if (ctx) f2fs_release_crypto_ctx(ctx); @@ -1770,6 +1172,137 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, return ret; } +/* + * This function was copied from write_cche_pages from mm/page-writeback.c. + * The major change is making write step of cold data page separately from + * warm/hot data page. + */ +static int f2fs_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int cycled; + int range_whole = 0; + int tag; + int step = 0; + + pagevec_init(&pvec, 0); +next: + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) { + done = 1; + break; + } + + done_index = page->index; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (step == is_cold_data(page)) + goto continue_unlock; + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + f2fs_wait_on_page_writeback(page, DATA); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + done_index = page->index + 1; + done = 1; + break; + } + } + + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + } + + if (step < 1) { + step++; + goto next; + } + + if (!cycled && !done) { + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + return ret; +} + static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1785,6 +1318,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!mapping->a_ops->writepage) return 0; + /* skip writing if there is no dirty page in this inode */ + if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) + return 0; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1800,12 +1337,11 @@ static int f2fs_write_data_pages(struct address_space *mapping, mutex_lock(&sbi->writepages); locked = true; } - ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + f2fs_submit_merged_bio(sbi, DATA, WRITE); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - remove_dirty_dir_inode(inode); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); @@ -1832,7 +1368,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *page, *ipage; + struct page *page = NULL; + struct page *ipage; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; @@ -1882,25 +1419,28 @@ repeat: if (err) goto put_fail; } - err = f2fs_reserve_block(&dn, index); + + err = f2fs_get_block(&dn, index); if (err) goto put_fail; put_next: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) - return 0; - f2fs_wait_on_page_writeback(page, DATA); + if (len == PAGE_CACHE_SIZE) + goto out_update; + if (PageUptodate(page)) + goto out_clear; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { unsigned start = pos & (PAGE_CACHE_SIZE - 1); unsigned end = start + len; /* Reading beyond i_size is simple: memset to zero */ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out; + goto out_update; } if (dn.data_blkaddr == NEW_ADDR) { @@ -1920,7 +1460,6 @@ put_next: lock_page(page); if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); err = -EIO; goto fail; } @@ -1932,14 +1471,13 @@ put_next: /* avoid symlink page */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { err = f2fs_decrypt_one(inode, page); - if (err) { - f2fs_put_page(page, 1); + if (err) goto fail; - } } } -out: +out_update: SetPageUptodate(page); +out_clear: clear_cold_data(page); return 0; @@ -1947,8 +1485,8 @@ put_fail: f2fs_put_dnode(&dn); unlock_fail: f2fs_unlock_op(sbi); - f2fs_put_page(page, 1); fail: + f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); return err; } @@ -1979,9 +1517,6 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, { unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; - if (iov_iter_rw(iter) == READ) - return 0; - if (offset & blocksize_mask) return -EINVAL; @@ -2010,15 +1545,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (check_direct_IO(inode, iter, offset)) - return 0; + err = check_direct_IO(inode, iter, offset); + if (err) + return err; trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == WRITE) __allocate_data_blocks(inode, offset, count); - err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block); + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); @@ -2045,6 +1581,11 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, else inode_dec_dirty_pages(inode); } + + /* This is atomic written page, keep Private */ + if (IS_ATOMIC_WRITTEN_PAGE(page)) + return; + ClearPagePrivate(page); } @@ -2054,6 +1595,10 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (PageDirty(page)) return 0; + /* This is atomic written page, keep Private */ + if (IS_ATOMIC_WRITTEN_PAGE(page)) + return 0; + ClearPagePrivate(page); return 1; } @@ -2068,12 +1613,17 @@ static int f2fs_set_data_page_dirty(struct page *page) SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { - register_inmem_page(inode, page); - return 1; + if (!IS_ATOMIC_WRITTEN_PAGE(page)) { + register_inmem_page(inode, page); + return 1; + } + /* + * Previously, this page has been registered, we just + * return here. + */ + return 0; } - mark_inode_dirty(inode); - if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); update_dirty_page(inode, page); @@ -2092,38 +1642,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (err) return err; } - return generic_block_bmap(mapping, block, get_data_block); -} - -void init_extent_cache_info(struct f2fs_sb_info *sbi) -{ - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; - atomic_set(&sbi->total_ext_node, 0); -} - -int __init create_extent_cache(void) -{ - extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", - sizeof(struct extent_tree)); - if (!extent_tree_slab) - return -ENOMEM; - extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", - sizeof(struct extent_node)); - if (!extent_node_slab) { - kmem_cache_destroy(extent_tree_slab); - return -ENOMEM; - } - return 0; -} - -void destroy_extent_cache(void) -{ - kmem_cache_destroy(extent_node_slab); - kmem_cache_destroy(extent_tree_slab); + return generic_block_bmap(mapping, block, get_data_block_bmap); } const struct address_space_operations f2fs_dblock_aops = { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 75176e0dd6c8..d013d8479753 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -33,8 +33,11 @@ static void update_general_status(struct f2fs_sb_info *sbi) int i; /* validation check of the segment numbers */ - si->hit_ext = sbi->read_hit_ext; - si->total_ext = sbi->total_hit_ext; + si->hit_largest = atomic_read(&sbi->read_hit_largest); + si->hit_cached = atomic_read(&sbi->read_hit_cached); + si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree); + si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; + si->total_ext = atomic_read(&sbi->total_hit_ext); si->ext_tree = sbi->total_ext_tree; si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); @@ -49,6 +52,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->utilization = utilization(sbi); @@ -226,6 +230,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_xattr Inode: %u\n", + si->inline_xattr); seq_printf(s, " - Inline_data Inode: %u\n", si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", @@ -276,10 +282,16 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", - si->hit_ext, si->total_ext); - seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree); - seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node); + seq_puts(s, "\nExtent Cache:\n"); + seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n", + si->hit_largest, si->hit_cached, + si->hit_rbtree); + seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n", + !si->total_ext ? 0 : + (si->hit_total * 100) / si->total_ext, + si->hit_total, si->total_ext); + seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", + si->ext_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); @@ -366,6 +378,12 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; + atomic_set(&sbi->total_hit_ext, 0); + atomic_set(&sbi->read_hit_rbtree, 0); + atomic_set(&sbi->read_hit_largest, 0); + atomic_set(&sbi->read_hit_cached, 0); + + atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a34ebd8312ab..8f15fc134040 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -718,8 +718,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode) f2fs_drop_nlink(dir, inode, NULL); - if (bit_pos == NR_DENTRY_IN_BLOCK) { - truncate_hole(dir, page->index, page->index + 1); + if (bit_pos == NR_DENTRY_IN_BLOCK && + !truncate_hole(dir, page->index, page->index + 1)) { clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c new file mode 100644 index 000000000000..997ac86f2a1d --- /dev/null +++ b/fs/f2fs/extent_cache.c @@ -0,0 +1,791 @@ +/* + * f2fs extent cache support + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Samsung Electronics + * Authors: Jaegeuk Kim <jaegeuk@kernel.org> + * Chao Yu <chao2.yu@samsung.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + +static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p) +{ + struct extent_node *en; + + en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + if (!en) + return NULL; + + en->ei = *ei; + INIT_LIST_HEAD(&en->list); + + rb_link_node(&en->rb_node, parent, p); + rb_insert_color(&en->rb_node, &et->root); + et->count++; + atomic_inc(&sbi->total_ext_node); + return en; +} + +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + rb_erase(&en->rb_node, &et->root); + et->count--; + atomic_dec(&sbi->total_ext_node); + + if (et->cached_en == en) + et->cached_en = NULL; +} + +static struct extent_tree *__grab_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + nid_t ino = inode->i_ino; + + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->root = RB_ROOT; + et->cached_en = NULL; + rwlock_init(&et->lock); + atomic_set(&et->refcount, 0); + et->count = 0; + sbi->total_ext_tree++; + } + atomic_inc(&et->refcount); + up_write(&sbi->extent_tree_lock); + + /* never died until evict_inode */ + F2FS_I(inode)->extent_tree = et; + + return et; +} + +static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, unsigned int fofs) +{ + struct rb_node *node = et->root.rb_node; + struct extent_node *en = et->cached_en; + + if (en) { + struct extent_info *cei = &en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { + stat_inc_cached_node_hit(sbi); + return en; + } + } + + while (node) { + en = rb_entry(node, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) { + node = node->rb_left; + } else if (fofs >= en->ei.fofs + en->ei.len) { + node = node->rb_right; + } else { + stat_inc_rbtree_node_hit(sbi); + return en; + } + } + return NULL; +} + +static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei) +{ + struct rb_node **p = &et->root.rb_node; + struct extent_node *en; + + en = __attach_extent_node(sbi, et, ei, NULL, p); + if (!en) + return NULL; + + et->largest = en->ei; + et->cached_en = en; + return en; +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, bool free_all) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count = et->count; + + node = rb_first(&et->root); + while (node) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + + if (free_all) { + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + } + + if (free_all || list_empty(&en->list)) { + __detach_extent_node(sbi, et, en); + kmem_cache_free(extent_node_slab, en); + } + node = next; + } + + return count - et->count; +} + +static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) +{ + struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; + + if (largest->fofs <= fofs && largest->fofs + largest->len > fofs) + largest->len = 0; +} + +void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) +{ + if (!f2fs_may_extent_tree(inode)) + return; + + __drop_largest_extent(inode, fofs); +} + +void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei; + + if (!f2fs_may_extent_tree(inode)) + return; + + et = __grab_extent_tree(inode); + + if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) + return; + + set_extent_info(&ei, le32_to_cpu(i_ext->fofs), + le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + + write_lock(&et->lock); + if (et->count) + goto out; + + en = __init_extent_tree(sbi, et, &ei); + if (en) { + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + } +out: + write_unlock(&et->lock); +} + +static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en; + bool ret = false; + + f2fs_bug_on(sbi, !et); + + trace_f2fs_lookup_extent_tree_start(inode, pgofs); + + read_lock(&et->lock); + + if (et->largest.fofs <= pgofs && + et->largest.fofs + et->largest.len > pgofs) { + *ei = et->largest; + ret = true; + stat_inc_largest_node_hit(sbi); + goto out; + } + + en = __lookup_extent_tree(sbi, et, pgofs); + if (en) { + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; + spin_unlock(&sbi->extent_lock); + ret = true; + } +out: + stat_inc_total_hit(sbi); + read_unlock(&et->lock); + + trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + return ret; +} + + +/* + * lookup extent at @fofs, if hit, return the extent + * if not, return NULL and + * @prev_ex: extent before fofs + * @next_ex: extent after fofs + * @insert_p: insert point for new extent at fofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, + unsigned int fofs, + struct extent_node **prev_ex, + struct extent_node **next_ex, + struct rb_node ***insert_p, + struct rb_node **insert_parent) +{ + struct rb_node **pnode = &et->root.rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct extent_node *en = et->cached_en; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_ex = NULL; + *next_ex = NULL; + + if (RB_EMPTY_ROOT(&et->root)) + return NULL; + + if (en) { + struct extent_info *cei = &en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + en = rb_entry(*pnode, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) + pnode = &(*pnode)->rb_left; + else if (fofs >= en->ei.fofs + en->ei.len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + en = rb_entry(parent, struct extent_node, rb_node); + tmp_node = parent; + if (parent && fofs > en->ei.fofs) + tmp_node = rb_next(parent); + *next_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + + tmp_node = parent; + if (parent && fofs < en->ei.fofs) + tmp_node = rb_prev(parent); + *prev_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + return NULL; + +lookup_neighbors: + if (fofs == en->ei.fofs) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&en->rb_node); + *prev_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + } + if (fofs == en->ei.fofs + en->ei.len - 1) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&en->rb_node); + *next_ex = tmp_node ? + rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + } + return en; +} + +static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node **den, + struct extent_node *prev_ex, + struct extent_node *next_ex) +{ + struct extent_node *en = NULL; + + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + prev_ex->ei.len += ei->len; + ei = &prev_ex->ei; + en = prev_ex; + } + + if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (en) { + __detach_extent_node(sbi, et, prev_ex); + *den = prev_ex; + } + next_ex->ei.fofs = ei->fofs; + next_ex->ei.blk = ei->blk; + next_ex->ei.len += ei->len; + en = next_ex; + } + + if (en) { + if (en->ei.len > et->largest.len) + et->largest = en->ei; + et->cached_en = en; + } + return en; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct rb_node **p = &et->root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) + p = &(*p)->rb_left; + else if (ei->fofs >= en->ei.fofs + en->ei.len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } +do_insert: + en = __attach_extent_node(sbi, et, ei, parent, p); + if (!en) + return NULL; + + if (en->ei.len > et->largest.len) + et->largest = en->ei; + et->cached_en = en; + return en; +} + +unsigned int f2fs_update_extent_tree_range(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei, dei, prev; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int end = fofs + len; + unsigned int pos = (unsigned int)fofs; + + if (!et) + return false; + + write_lock(&et->lock); + + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + write_unlock(&et->lock); + return false; + } + + prev = et->largest; + dei.len = 0; + + /* we do not guarantee that the largest extent is cached all the time */ + __drop_largest_extent(inode, fofs); + + /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ + en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, + &insert_p, &insert_parent); + if (!en) { + if (next_en) { + en = next_en; + f2fs_bug_on(sbi, en->ei.fofs <= pos); + pos = en->ei.fofs; + } else { + /* + * skip searching in the tree since there is no + * larger extent node in the cache. + */ + goto update_extent; + } + } + + /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ + while (en) { + struct rb_node *node; + + if (pos >= end) + break; + + dei = en->ei; + en1 = en2 = NULL; + + node = rb_next(&en->rb_node); + + /* + * 2.1 there are four cases when we invalidate blkaddr in extent + * node, |V: valid address, X: will be invalidated| + */ + /* case#1, invalidate right part of extent node |VVVVVXXXXX| */ + if (pos > dei.fofs && end >= dei.fofs + dei.len) { + en->ei.len = pos - dei.fofs; + + if (en->ei.len < F2FS_MIN_EXTENT_LEN) { + __detach_extent_node(sbi, et, en); + insert_p = NULL; + insert_parent = NULL; + goto update; + } + + if (__is_extent_same(&dei, &et->largest)) + et->largest = en->ei; + goto next; + } + + /* case#2, invalidate left part of extent node |XXXXXVVVVV| */ + if (pos <= dei.fofs && end < dei.fofs + dei.len) { + en->ei.fofs = end; + en->ei.blk += end - dei.fofs; + en->ei.len -= end - dei.fofs; + + if (en->ei.len < F2FS_MIN_EXTENT_LEN) { + __detach_extent_node(sbi, et, en); + insert_p = NULL; + insert_parent = NULL; + goto update; + } + + if (__is_extent_same(&dei, &et->largest)) + et->largest = en->ei; + goto next; + } + + __detach_extent_node(sbi, et, en); + + /* + * if we remove node in rb-tree, our parent node pointer may + * point the wrong place, discard them. + */ + insert_p = NULL; + insert_parent = NULL; + + /* case#3, invalidate entire extent node |XXXXXXXXXX| */ + if (pos <= dei.fofs && end >= dei.fofs + dei.len) { + if (__is_extent_same(&dei, &et->largest)) + et->largest.len = 0; + goto update; + } + + /* + * case#4, invalidate data in the middle of extent node + * |VVVXXXXVVV| + */ + if (dei.len > F2FS_MIN_EXTENT_LEN) { + unsigned int endofs; + + /* insert left part of split extent into cache */ + if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, dei.fofs, dei.blk, + pos - dei.fofs); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + } + + /* insert right part of split extent into cache */ + endofs = dei.fofs + dei.len; + if (endofs - end >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, end, + end - dei.fofs + dei.blk, + endofs - end); + en2 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + } + } +update: + /* 2.2 update in global extent list */ + spin_lock(&sbi->extent_lock); + if (en && !list_empty(&en->list)) + list_del(&en->list); + if (en1) + list_add_tail(&en1->list, &sbi->extent_list); + if (en2) + list_add_tail(&en2->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + + /* 2.3 release extent node */ + if (en) + kmem_cache_free(extent_node_slab, en); +next: + en = node ? rb_entry(node, struct extent_node, rb_node) : NULL; + next_en = en; + if (en) + pos = en->ei.fofs; + } + +update_extent: + /* 3. update extent in extent cache */ + if (blkaddr) { + struct extent_node *den = NULL; + + set_extent_info(&ei, fofs, blkaddr, len); + en3 = __try_merge_extent_node(sbi, et, &ei, &den, + prev_en, next_en); + if (!en3) + en3 = __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent); + + /* give up extent_cache, if split and small updates happen */ + if (dei.len >= 1 && + prev.len < F2FS_MIN_EXTENT_LEN && + et->largest.len < F2FS_MIN_EXTENT_LEN) { + et->largest.len = 0; + set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + } + + spin_lock(&sbi->extent_lock); + if (en3) { + if (list_empty(&en3->list)) + list_add_tail(&en3->list, &sbi->extent_list); + else + list_move_tail(&en3->list, &sbi->extent_list); + } + if (den && !list_empty(&den->list)) + list_del(&den->list); + spin_unlock(&sbi->extent_lock); + + if (den) + kmem_cache_free(extent_node_slab, den); + } + + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + __free_extent_tree(sbi, et, true); + + write_unlock(&et->lock); + + return !__is_extent_same(&prev, &et->largest); +} + +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_node *en, *tmp; + unsigned long ino = F2FS_ROOT_INO(sbi); + struct radix_tree_root *root = &sbi->extent_tree_root; + unsigned int found; + unsigned int node_cnt = 0, tree_cnt = 0; + int remained; + + if (!test_opt(sbi, EXTENT_CACHE)) + return 0; + + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + + /* 1. remove unreferenced extent tree */ + while ((found = radix_tree_gang_lookup(root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + if (!atomic_read(&et->refcount)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + radix_tree_delete(root, et->ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + } + } + } + up_write(&sbi->extent_tree_lock); + + /* 2. remove LRU extent entries */ + if (!down_write_trylock(&sbi->extent_tree_lock)) + goto out; + + remained = nr_shrink - (node_cnt + tree_cnt); + + spin_lock(&sbi->extent_lock); + list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { + if (!remained--) + break; + list_del_init(&en->list); + } + spin_unlock(&sbi->extent_lock); + + while ((found = radix_tree_gang_lookup(root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + + if (node_cnt + tree_cnt >= nr_shrink) + break; + } + } +unlock_out: + up_write(&sbi->extent_tree_lock); +out: + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + + return node_cnt + tree_cnt; +} + +unsigned int f2fs_destroy_extent_node(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et) + return 0; + + write_lock(&et->lock); + node_cnt = __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + return node_cnt; +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + unsigned int node_cnt = 0; + + if (!et) + return; + + if (inode->i_nlink && !is_bad_inode(inode) && et->count) { + atomic_dec(&et->refcount); + return; + } + + /* free all extent info belong to this extent tree */ + node_cnt = f2fs_destroy_extent_node(inode); + + /* delete extent tree entry in radix tree */ + down_write(&sbi->extent_tree_lock); + atomic_dec(&et->refcount); + f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + up_write(&sbi->extent_tree_lock); + + F2FS_I(inode)->extent_tree = NULL; + + trace_f2fs_destroy_extent_tree(inode, node_cnt); +} + +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!f2fs_may_extent_tree(inode)) + return false; + + return f2fs_lookup_extent_tree(inode, pgofs, ei); +} + +void f2fs_update_extent_cache(struct dnode_of_data *dn) +{ + struct f2fs_inode_info *fi = F2FS_I(dn->inode); + pgoff_t fofs; + + if (!f2fs_may_extent_tree(dn->inode)) + return; + + f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + + if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) + sync_inode_page(dn); +} + +void f2fs_update_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) + +{ + if (!f2fs_may_extent_tree(dn->inode)) + return; + + if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) + sync_inode_page(dn); +} + +void init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); + init_rwsem(&sbi->extent_tree_lock); + INIT_LIST_HEAD(&sbi->extent_list); + spin_lock_init(&sbi->extent_lock); + sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_node, 0); +} + +int __init create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if (!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a8327ed73898..f1a90ffd7cad 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include <linux/magic.h> #include <linux/kobject.h> #include <linux/sched.h> +#include <linux/bio.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -228,6 +229,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) @@ -320,7 +322,7 @@ enum { */ }; -#define F2FS_LINK_MAX 32000 /* maximum link count per file */ +#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@ -349,6 +351,7 @@ struct extent_tree { nid_t ino; /* inode number */ struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ + struct extent_info largest; /* largested extent info */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t refcount; /* reference count of rb-tree */ unsigned int count; /* # of extent node in rb-tree*/ @@ -372,6 +375,12 @@ struct f2fs_map_blocks { unsigned int m_flags; }; +/* for flag in get_data_block */ +#define F2FS_GET_BLOCK_READ 0 +#define F2FS_GET_BLOCK_DIO 1 +#define F2FS_GET_BLOCK_FIEMAP 2 +#define F2FS_GET_BLOCK_BMAP 3 + /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ @@ -420,14 +429,13 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct extent_info ext; /* in-memory extent cache entry */ - rwlock_t ext_lock; /* rwlock for single extent cache */ struct inode_entry *dirty_dir; /* the pointer of dirty dir */ - struct radix_tree_root inmem_root; /* radix tree for inmem pages */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ + struct extent_tree *extent_tree; /* cached extent_tree entry */ + #ifdef CONFIG_F2FS_FS_ENCRYPTION /* Encryption params */ struct f2fs_crypt_info *i_crypt_info; @@ -779,7 +787,11 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + atomic_t total_hit_ext; /* # of lookup extent cache */ + atomic_t read_hit_rbtree; /* # of hit rbtree extent node */ + atomic_t read_hit_largest; /* # of hit largest extent node */ + atomic_t read_hit_cached; /* # of hit cached extent node */ + atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ @@ -791,6 +803,11 @@ struct f2fs_sb_info { /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; + + /* For shrinker support */ + struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; }; /* @@ -1039,7 +1056,8 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_pages(struct inode *inode) { - if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; atomic_dec(&F2FS_I(inode)->dirty_pages); @@ -1234,16 +1252,24 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *entry; -retry: - entry = kmem_cache_alloc(cachep, flags); - if (!entry) { - cond_resched(); - goto retry; - } + entry = kmem_cache_alloc(cachep, flags); + if (!entry) + entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL); return entry; } +static inline struct bio *f2fs_bio_alloc(int npages) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; +} + static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { @@ -1342,6 +1368,7 @@ enum { FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nide */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ @@ -1541,6 +1568,17 @@ static inline bool is_dot_dotdot(const struct qstr *str) return false; } +static inline bool f2fs_may_extent_tree(struct inode *inode) +{ + mode_t mode = inode->i_mode; + + if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || + is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + return false; + + return S_ISREG(mode); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1557,7 +1595,7 @@ static inline bool is_dot_dotdot(const struct qstr *str) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -void f2fs_truncate(struct inode *); +int f2fs_truncate(struct inode *, bool); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1649,7 +1687,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -void remove_inode_page(struct inode *); +int remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); @@ -1660,6 +1698,7 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); +int try_to_free_nids(struct f2fs_sb_info *, int); void recover_inline_xattr(struct inode *, struct page *); void recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); @@ -1675,7 +1714,7 @@ void destroy_node_manager_caches(void); * segment.c */ void register_inmem_page(struct inode *, struct page *); -void commit_inmem_pages(struct inode *, bool); +int commit_inmem_pages(struct inode *, bool); void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); @@ -1685,7 +1724,7 @@ void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); -void discard_next_dnode(struct f2fs_sb_info *, block_t); +bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -1727,7 +1766,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -void recover_orphan_inodes(struct f2fs_sb_info *); +int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); @@ -1746,21 +1785,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); +int f2fs_get_block(struct dnode_of_data *, pgoff_t); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_destroy_extent_tree(struct inode *); -void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *); -void f2fs_update_extent_cache(struct dnode_of_data *); -void f2fs_preserve_extent_tree(struct inode *); struct page *get_read_data_page(struct inode *, pgoff_t, int); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void init_extent_cache_info(struct f2fs_sb_info *); -int __init create_extent_cache(void); -void destroy_extent_cache(void); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1788,11 +1820,13 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_ext, total_ext, ext_tree, ext_node; + int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext; + int ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages; + int bg_gc, inmem_pages, wb_pages; + int inline_xattr, inline_inode, inline_dir; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -1823,8 +1857,20 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) -#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) -#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) +#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached)) +#define stat_inc_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_dec_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) #define stat_inc_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ @@ -1894,7 +1940,11 @@ void f2fs_destroy_root_stats(void); #define stat_inc_dirty_dir(sbi) #define stat_dec_dirty_dir(sbi) #define stat_inc_total_hit(sb) -#define stat_inc_read_hit(sb) +#define stat_inc_rbtree_node_hit(sb) +#define stat_inc_largest_node_hit(sbi) +#define stat_inc_cached_node_hit(sbi) +#define stat_inc_inline_xattr(inode) +#define stat_dec_inline_xattr(inode) #define stat_inc_inline_inode(inode) #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) @@ -1950,6 +2000,30 @@ int f2fs_read_inline_dir(struct file *, struct dir_context *, struct f2fs_str *); /* + * shrinker.c + */ +unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); +unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); +void f2fs_join_shrinker(struct f2fs_sb_info *); +void f2fs_leave_shrinker(struct f2fs_sb_info *); + +/* + * extent_cache.c + */ +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); +void f2fs_drop_largest_extent(struct inode *, pgoff_t); +void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +unsigned int f2fs_destroy_extent_node(struct inode *); +void f2fs_destroy_extent_tree(struct inode *); +bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); +void f2fs_update_extent_cache(struct dnode_of_data *); +void f2fs_update_extent_cache_range(struct dnode_of_data *dn, + pgoff_t, block_t, unsigned int); +void init_extent_cache_info(struct f2fs_sb_info *); +int __init create_extent_cache(void); +void destroy_extent_cache(void); + +/* * crypto support */ static inline int f2fs_encrypted_inode(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ada2a3dd701a..8120f8685141 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -27,6 +27,7 @@ #include "segment.h" #include "xattr.h" #include "acl.h" +#include "gc.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -85,6 +86,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA); + /* if gced page is attached, don't write to cold segment */ + clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -203,8 +206,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) { - update_inode_page(inode); + if (!datasync) { + f2fs_write_inode(inode, NULL); goto go_write; } @@ -442,9 +445,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { - int nr_free = 0, ofs = dn->ofs_in_node; struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; + int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; raw_node = F2FS_NODE(dn->node_page); @@ -457,14 +460,22 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->data_blkaddr = NULL_ADDR; set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(F2FS_I(dn->inode), FI_FIRST_BLOCK_WRITTEN); nr_free++; } + if (nr_free) { + pgoff_t fofs; + /* + * once we invalidate valid blkaddr in range [ofs, ofs + count], + * we will invalidate all blkaddr in the whole range. + */ + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), + F2FS_I(dn->inode)) + ofs; + f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); set_page_dirty(dn->node_page); sync_inode_page(dn); @@ -576,24 +587,30 @@ out: return err; } -void f2fs_truncate(struct inode *inode) +int f2fs_truncate(struct inode *inode, bool lock) { + int err; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) - return; + return 0; trace_f2fs_truncate(inode); /* we should check inline_data size */ if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { - if (f2fs_convert_inline_inode(inode)) - return; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } - if (!truncate_blocks(inode, i_size_read(inode), true)) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); - } + err = truncate_blocks(inode, i_size_read(inode), lock); + if (err) + return err; + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + return 0; } int f2fs_getattr(struct vfsmount *mnt, @@ -653,7 +670,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - f2fs_truncate(inode); + err = f2fs_truncate(inode, true); + if (err) + return err; f2fs_balance_fs(F2FS_I_SB(inode)); } else { /* @@ -692,14 +711,14 @@ const struct inode_operations f2fs_file_inode_operations = { .fiemap = f2fs_fiemap, }; -static void fill_zero(struct inode *inode, pgoff_t index, +static int fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; if (!len) - return; + return 0; f2fs_balance_fs(sbi); @@ -707,12 +726,14 @@ static void fill_zero(struct inode *inode, pgoff_t index, page = get_new_data_page(inode, NULL, index, false); f2fs_unlock_op(sbi); - if (!IS_ERR(page)) { - f2fs_wait_on_page_writeback(page, DATA); - zero_user(page, start, len); - set_page_dirty(page); - f2fs_put_page(page, 1); - } + if (IS_ERR(page)) + return PTR_ERR(page); + + f2fs_wait_on_page_writeback(page, DATA); + zero_user(page, start, len); + set_page_dirty(page); + f2fs_put_page(page, 1); + return 0; } int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) @@ -760,14 +781,22 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); if (pg_start == pg_end) { - fill_zero(inode, pg_start, off_start, + ret = fill_zero(inode, pg_start, off_start, off_end - off_start); + if (ret) + return ret; } else { - if (off_start) - fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); - if (off_end) - fill_zero(inode, pg_end, 0, off_end); + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + if (ret) + return ret; + } + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + return ret; + } if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; @@ -797,11 +826,11 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; int ret = 0; - f2fs_lock_op(sbi); - for (; end < nrpages; start++, end++) { block_t new_addr, old_addr; + f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { @@ -817,13 +846,16 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) if (new_addr == NULL_ADDR) { set_new_dnode(&dn, inode, NULL, NULL, 0); ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) + if (ret && ret != -ENOENT) { goto out; - else if (ret == -ENOENT) + } else if (ret == -ENOENT) { + f2fs_unlock_op(sbi); continue; + } if (dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); continue; } else { truncate_data_blocks_range(&dn, 1); @@ -862,8 +894,9 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) f2fs_put_dnode(&dn); } + f2fs_unlock_op(sbi); } - ret = 0; + return 0; out: f2fs_unlock_op(sbi); return ret; @@ -885,6 +918,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; + f2fs_balance_fs(F2FS_I_SB(inode)); + + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + pg_start = offset >> PAGE_CACHE_SHIFT; pg_end = (offset + len) >> PAGE_CACHE_SHIFT; @@ -946,14 +987,21 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); if (pg_start == pg_end) { - fill_zero(inode, pg_start, off_start, off_end - off_start); + ret = fill_zero(inode, pg_start, off_start, + off_end - off_start); + if (ret) + return ret; + if (offset + len > new_size) new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { - fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + ret = fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + if (ret) + return ret; + new_size = max_t(loff_t, new_size, pg_start << PAGE_CACHE_SHIFT); } @@ -995,7 +1043,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } if (off_end) { - fill_zero(inode, pg_end, 0, off_end); + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + goto out; + new_size = max_t(loff_t, new_size, offset + len); } } @@ -1033,6 +1084,12 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi); + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) return ret; @@ -1302,6 +1359,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -1311,9 +1369,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) return 0; - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - return f2fs_convert_inline_inode(inode); + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + return 0; } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -1331,18 +1392,23 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, false); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + ret = commit_inmem_pages(inode, false); + if (ret) + goto err_out; + } - ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); + ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); +err_out: mnt_drop_write_file(filp); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); return ret; } static int f2fs_ioc_start_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -1350,9 +1416,12 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (f2fs_is_volatile_file(inode)) return 0; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - return f2fs_convert_inline_inode(inode); + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + return 0; } static int f2fs_ioc_release_volatile_write(struct file *filp) @@ -1387,8 +1456,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) f2fs_balance_fs(F2FS_I_SB(inode)); if (f2fs_is_atomic_file(inode)) { - commit_inmem_pages(inode, false); clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + commit_inmem_pages(inode, true); } if (f2fs_is_volatile_file(inode)) @@ -1543,6 +1612,35 @@ got_it: return 0; } +static int f2fs_ioc_gc(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + __u32 i, count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(count, (__u32 __user *)arg)) + return -EFAULT; + + if (!count || count > F2FS_BATCH_GC_MAX_NUM) + return -EINVAL; + + for (i = 0; i < count; i++) { + if (!mutex_trylock(&sbi->gc_mutex)) + break; + + if (f2fs_gc(sbi)) + break; + } + + if (put_user(i, (__u32 __user *)arg)) + return -EFAULT; + + return 0; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1572,6 +1670,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_policy(filp, arg); case F2FS_IOC_GET_ENCRYPTION_PWSALT: return f2fs_ioc_get_encryption_pwsalt(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT: + return f2fs_ioc_gc(filp, arg); default: return -ENOTTY; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e1e73617d13b..782b8e72c094 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -391,23 +391,27 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static void gc_node_segment(struct f2fs_sb_info *sbi, +static int gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { bool initial = true; struct f2fs_summary *entry; + block_t start_addr; int off; + start_addr = START_BLOCK(sbi, segno); + next_step: entry = sum; for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; + struct node_info ni; /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return; + return 0; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -426,6 +430,12 @@ next_step: continue; } + get_node_info(sbi, nid, &ni); + if (ni.blk_addr != start_addr + off) { + f2fs_put_page(node_page, 1); + continue; + } + /* set page dirty and write it */ if (gc_type == FG_GC) { f2fs_wait_on_page_writeback(node_page, NODE); @@ -451,13 +461,11 @@ next_step: }; sync_node_pages(sbi, 0, &wbc); - /* - * In the case of FG_GC, it'd be better to reclaim this victim - * completely. - */ - if (get_valid_blocks(sbi, segno, 1) != 0) - goto next_step; + /* return 1 only if FG_GC succefully reclaimed one */ + if (get_valid_blocks(sbi, segno, 1) == 0) + return 1; } + return 0; } /* @@ -487,7 +495,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); } -static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct node_info *dni, block_t blkaddr, unsigned int *nofs) { struct page *node_page; @@ -500,13 +508,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) - return 0; + return false; get_node_info(sbi, nid, dni); if (sum->version != dni->version) { f2fs_put_page(node_page, 1); - return 0; + return false; } *nofs = ofs_of_node(node_page); @@ -514,8 +522,8 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) - return 0; - return 1; + return false; + return true; } static void move_encrypted_block(struct inode *inode, block_t bidx) @@ -552,31 +560,46 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) fio.page = page; fio.blk_addr = dn.data_blkaddr; - fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr); + fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), + fio.blk_addr, + FGP_LOCK|FGP_CREAT, + GFP_NOFS); if (!fio.encrypted_page) goto put_out; - f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_page_out; + + /* write page */ + lock_page(fio.encrypted_page); + + if (unlikely(!PageUptodate(fio.encrypted_page))) + goto put_page_out; + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + goto put_page_out; + + set_page_dirty(fio.encrypted_page); + f2fs_wait_on_page_writeback(fio.encrypted_page, META); + if (clear_page_dirty_for_io(fio.encrypted_page)) + dec_page_count(fio.sbi, F2FS_DIRTY_META); + + set_page_writeback(fio.encrypted_page); /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE); - allocate_data_block(fio.sbi, NULL, fio.blk_addr, &fio.blk_addr, &sum, CURSEG_COLD_DATA); - dn.data_blkaddr = fio.blk_addr; - - /* write page */ - lock_page(fio.encrypted_page); - set_page_writeback(fio.encrypted_page); fio.rw = WRITE_SYNC; f2fs_submit_page_mbio(&fio); + dn.data_blkaddr = fio.blk_addr; set_data_blkaddr(&dn); f2fs_update_extent_cache(&dn); set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); if (page->index == 0) set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); - +put_page_out: f2fs_put_page(fio.encrypted_page, 1); put_out: f2fs_put_dnode(&dn); @@ -605,8 +628,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); set_cold_data(page); @@ -624,7 +647,7 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -647,7 +670,7 @@ next_step: /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return; + return 0; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -658,7 +681,7 @@ next_step: } /* Get an inode by ino with checking validity */ - if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) + if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; if (phase == 1) { @@ -712,15 +735,11 @@ next_step: if (gc_type == FG_GC) { f2fs_submit_merged_bio(sbi, DATA, WRITE); - /* - * In the case of FG_GC, it'd be better to reclaim this victim - * completely. - */ - if (get_valid_blocks(sbi, segno, 1) != 0) { - phase = 2; - goto next_step; - } + /* return 1 only if FG_GC succefully reclaimed one */ + if (get_valid_blocks(sbi, segno, 1) == 0) + return 1; } + return 0; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -736,12 +755,13 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; + int nfree = 0; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); @@ -761,10 +781,11 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, switch (GET_SUM_TYPE((&sum->footer))) { case SUM_TYPE_NODE: - gc_node_segment(sbi, sum->entries, segno, gc_type); + nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); break; case SUM_TYPE_DATA: - gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type); + nfree = gc_data_segment(sbi, sum->entries, gc_list, + segno, gc_type); break; } blk_finish_plug(&plug); @@ -773,11 +794,13 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, stat_inc_call_count(sbi->stat_info); f2fs_put_page(sum_page, 0); + return nfree; } int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno, i; + unsigned int segno = NULL_SEGNO; + unsigned int i; int gc_type = BG_GC; int nfree = 0; int ret = -1; @@ -796,10 +819,11 @@ gc_more: if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; - write_checkpoint(sbi, &cpc); + if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) + write_checkpoint(sbi, &cpc); } - if (!__get_victim(sbi, &segno, gc_type)) + if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; @@ -809,13 +833,10 @@ gc_more: META_SSA); for (i = 0; i < sbi->segs_per_sec; i++) - do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type); - if (gc_type == FG_GC) { + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - nfree++; - WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); - } if (has_not_enough_free_secs(sbi, nfree)) goto gc_more; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..c5a055b3376e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -19,6 +19,12 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ +/* + * with this macro, we can control the max time we do garbage collection, + * when user triggers batch mode gc by ioctl. + */ +#define F2FS_BATCH_GC_MAX_NUM 16 + /* Search max. number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 38e75fb1e488..3d143be42895 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -141,6 +141,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) kunmap_atomic(dst_addr); SetPageUptodate(page); no_update: + set_page_dirty(page); + /* clear dirty state */ dirty = clear_page_dirty_for_io(page); @@ -358,6 +360,10 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, return 0; } +/* + * NOTE: ipage is grabbed by caller, but if any error occurs, we should + * release ipage in this function. + */ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { @@ -367,8 +373,10 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, int err; page = grab_cache_page(dir->i_mapping, 0); - if (!page) + if (!page) { + f2fs_put_page(ipage, 1); return -ENOMEM; + } set_new_dnode(&dn, dir, ipage, NULL, 0); err = f2fs_reserve_block(&dn, 0); @@ -376,13 +384,21 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); dentry_blk = kmap_atomic(page); /* copy data from inline dentry block to new dentry block */ memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, INLINE_DENTRY_BITMAP_SIZE); + memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, + SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + /* + * we do not need to zero out remainder part of dentry and filename + * field, since we have used bitmap for marking the usage status of + * them, besides, we can also ignore copying/zeroing reserved space + * of dentry block, because them haven't been used so far. + */ memcpy(dentry_blk->dentry, inline_dentry->dentry, sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); memcpy(dentry_blk->filename, inline_dentry->filename, @@ -432,8 +448,9 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, slots, NR_INLINE_DENTRY); if (bit_pos >= NR_INLINE_DENTRY) { err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); - if (!err) - err = -EAGAIN; + if (err) + return err; + err = -EAGAIN; goto out; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2550868dc651..35aae65b3e5d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -12,7 +12,6 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/writeback.h> -#include <linux/bitops.h> #include "f2fs.h" #include "node.h" @@ -34,8 +33,8 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; - set_mask_bits(&inode->i_flags, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -139,7 +138,7 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_cache(inode, &ri->i_ext); + f2fs_init_extent_tree(inode, &ri->i_ext); get_inline_info(fi, ri); @@ -155,6 +154,7 @@ static int do_read_inode(struct inode *inode) f2fs_put_page(node_page, 1); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -237,10 +237,11 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - read_lock(&F2FS_I(inode)->ext_lock); - set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); - read_unlock(&F2FS_I(inode)->ext_lock); - + if (F2FS_I(inode)->extent_tree) + set_raw_extent(&F2FS_I(inode)->extent_tree->largest, + &ri->i_ext); + else + memset(&ri->i_ext, 0, sizeof(ri->i_ext)); set_raw_inline(F2FS_I(inode), ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); @@ -314,7 +315,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - nid_t xnid = F2FS_I(inode)->i_xattr_nid; + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t xnid = fi->i_xattr_nid; + int err = 0; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) @@ -330,41 +333,62 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); remove_dirty_dir_inode(inode); + f2fs_destroy_extent_tree(inode); + if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; sb_start_intwrite(inode->i_sb); - set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); + set_inode_flag(fi, FI_NO_ALLOC); i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + err = f2fs_truncate(inode, true); - f2fs_lock_op(sbi); - remove_inode_page(inode); - f2fs_unlock_op(sbi); + if (!err) { + f2fs_lock_op(sbi); + err = remove_inode_page(inode); + f2fs_unlock_op(sbi); + } sb_end_intwrite(inode->i_sb); no_delete: + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - /* update extent info in inode */ - if (inode->i_nlink) - f2fs_preserve_extent_tree(inode); - f2fs_destroy_extent_tree(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) + if (is_inode_flag_set(fi, FI_APPEND_WRITE)) add_dirty_inode(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) + if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + if (is_inode_flag_set(fi, FI_FREE_NID)) { + if (err && err != -ENOENT) + alloc_nid_done(sbi, inode->i_ino); + else + alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(fi, FI_FREE_NID); + } + + if (err && err != -ENOENT) { + if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { + /* + * get here because we failed to release resource + * of inode previously, reminder our user to run fsck + * for fixing. + */ + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "inode (ino:%lu) resource leak, run fsck " + "to fix this issue!", inode->i_ino); + } + } out_clear: #ifdef CONFIG_F2FS_FS_ENCRYPTION - if (F2FS_I(inode)->i_crypt_info) - f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info); + if (fi->i_crypt_info) + f2fs_free_encryption_info(inode, fi->i_crypt_info); #endif clear_inode(inode); } @@ -373,6 +397,7 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err = 0; clear_nlink(inode); make_bad_inode(inode); @@ -380,13 +405,29 @@ void handle_failed_inode(struct inode *inode) i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + err = f2fs_truncate(inode, false); - remove_inode_page(inode); + if (!err) + err = remove_inode_page(inode); + + /* + * if we skip truncate_node in remove_inode_page bacause we failed + * before, it's better to find another way to release resource of + * this inode (e.g. valid block count, node block or nid). Here we + * choose to add this inode to orphan list, so that we can call iput + * for releasing in orphan recovery flow. + * + * Note: we should add inode to orphan list before f2fs_unlock_op() + * so we can prevent losing this orphan when encoutering checkpoint + * and following suddenly power-off. + */ + if (err && err != -ENOENT) { + err = acquire_orphan_inode(sbi); + if (!err) + add_orphan_inode(sbi, inode->i_ino); + } - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); - alloc_nid_failed(sbi, inode->i_ino); + set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index fdbae21ee8fb..a680bf38e4f0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -53,7 +53,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) { err = -EINVAL; nid_free = true; - goto out; + goto fail; } /* If the directory encrypted, then we should encrypt the inode. */ @@ -65,6 +65,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + f2fs_init_extent_tree(inode, NULL); + + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -72,15 +75,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) mark_inode_dirty(inode); return inode; -out: - clear_nlink(inode); - unlock_new_inode(inode); fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); - iput(inode); if (nid_free) - alloc_nid_failed(sbi, ino); + set_inode_flag(F2FS_I(inode), FI_FREE_NID); + iput(inode); return ERR_PTR(err); } @@ -89,7 +89,14 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) size_t slen = strlen(s); size_t sublen = strlen(sub); - if (sublen > slen) + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension". + */ + if (slen < sublen + 2) + return 0; + + if (s[slen - sublen - 1] != '.') return 0; return !strncasecmp(s + slen - sublen, sub, sublen); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7dd63b794bfb..27d1a74dd6f3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -159,7 +159,7 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { - head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); INIT_LIST_HEAD(&head->entry_list); INIT_LIST_HEAD(&head->set_list); @@ -246,7 +246,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC); + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); @@ -306,6 +306,10 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); + + /* in order to reuse the nid */ + if (nm_i->next_scan_nid > ni->nid) + nm_i->next_scan_nid = ni->nid; } /* change address */ @@ -328,11 +332,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); + int nr = nr_shrink; - if (available_free_memory(sbi, NAT_ENTRIES)) + if (!down_write_trylock(&nm_i->nat_tree_lock)) return 0; - down_write(&nm_i->nat_tree_lock); while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; ne = list_first_entry(&nm_i->nat_entries, @@ -341,7 +345,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) nr_shrink--; } up_write(&nm_i->nat_tree_lock); - return nr_shrink; + return nr - nr_shrink; } /* @@ -898,17 +902,20 @@ int truncate_xattr_node(struct inode *inode, struct page *page) * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). */ -void remove_inode_page(struct inode *inode) +int remove_inode_page(struct inode *inode) { struct dnode_of_data dn; + int err; set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - if (get_dnode_of_data(&dn, 0, LOOKUP_NODE)) - return; + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; - if (truncate_xattr_node(inode, dn.inode_page)) { + err = truncate_xattr_node(inode, dn.inode_page); + if (err) { f2fs_put_dnode(&dn); - return; + return err; } /* remove potential inline_data blocks */ @@ -922,6 +929,7 @@ void remove_inode_page(struct inode *inode) /* will put inode & node pages */ truncate_node(&dn); + return 0; } struct page *new_inode_page(struct inode *inode) @@ -991,8 +999,7 @@ fail: /* * Caller should do after getting the following values. * 0: f2fs_put_page(page, 0) - * LOCKED_PAGE: f2fs_put_page(page, 1) - * error: nothing + * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ static int read_node_page(struct page *page, int rw) { @@ -1010,7 +1017,6 @@ static int read_node_page(struct page *page, int rw) if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); - f2fs_put_page(page, 1); return -ENOENT; } @@ -1041,10 +1047,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) return; err = read_node_page(apage, READA); - if (err == 0) - f2fs_put_page(apage, 0); - else if (err == LOCKED_PAGE) - f2fs_put_page(apage, 1); + f2fs_put_page(apage, err ? 1 : 0); } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1057,10 +1060,12 @@ repeat: return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err < 0) + if (err < 0) { + f2fs_put_page(page, 1); return ERR_PTR(err); - else if (err != LOCKED_PAGE) + } else if (err != LOCKED_PAGE) { lock_page(page); + } if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { ClearPageUptodate(page); @@ -1096,10 +1101,12 @@ repeat: return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err < 0) + if (err < 0) { + f2fs_put_page(page, 1); return ERR_PTR(err); - else if (err == LOCKED_PAGE) + } else if (err == LOCKED_PAGE) { goto page_hit; + } blk_start_plug(&plug); @@ -1533,7 +1540,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) if (unlikely(nid >= nm_i->max_nid)) nid = 0; - if (i++ == FREE_NID_PAGES) + if (++i >= FREE_NID_PAGES) break; } @@ -1570,6 +1577,8 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { + struct node_info ni; + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1580,6 +1589,13 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); + + /* check nid is allocated already */ + get_node_info(sbi, *nid, &ni); + if (ni.blk_addr != NULL_ADDR) { + alloc_nid_done(sbi, *nid); + goto retry; + } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1636,6 +1652,32 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *next; + int nr = nr_shrink; + + if (!mutex_trylock(&nm_i->build_lock)) + return 0; + + spin_lock(&nm_i->free_nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { + if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + break; + if (i->state == NID_ALLOC) + continue; + __del_from_free_nid_list(nm_i, i); + kmem_cache_free(free_nid_slab, i); + nm_i->fcnt--; + nr_shrink--; + } + spin_unlock(&nm_i->free_nid_list_lock); + mutex_unlock(&nm_i->build_lock); + + return nr - nr_shrink; +} + void recover_inline_xattr(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 24a8c1d4f45f..faec2ca004b9 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -399,14 +399,35 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); - for (; start < end; start++) { + for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; src = datablock_addr(dn.node_page, dn.ofs_in_node); dest = datablock_addr(page, dn.ofs_in_node); - if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR && - is_valid_blkaddr(sbi, dest, META_POR)) { + /* skip recovering if dest is the same as src */ + if (src == dest) + continue; + + /* dest is invalid, just invalidate src block */ + if (dest == NULL_ADDR) { + truncate_data_blocks_range(&dn, 1); + continue; + } + + /* + * dest is reserved block, invalidate src block + * and then reserve one new block in dnode page. + */ + if (dest == NEW_ADDR) { + truncate_data_blocks_range(&dn, 1); + err = reserve_new_block(&dn); + f2fs_bug_on(sbi, err); + continue; + } + + /* dest is valid block, try to recover from src to dest */ + if (is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = reserve_new_block(&dn); @@ -424,7 +445,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, ni.version, false); recovered++; } - dn.ofs_in_node++; } if (IS_INODE(dn.node_page)) @@ -525,14 +545,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&inode_list); - /* step #1: find fsynced inode numbers */ - set_sbi_flag(sbi, SBI_POR_DOING); - /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -561,11 +579,20 @@ out: clear_sbi_flag(sbi, SBI_POR_DOING); if (err) { - discard_next_dnode(sbi, blkaddr); + bool invalidate = false; + + if (discard_next_dnode(sbi, blkaddr)) + invalidate = true; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) sync_meta_pages(sbi, META, LONG_MAX); + + /* invalidate temporary meta page */ + if (invalidate) + invalidate_mapping_pages(META_MAPPING(sbi), + blkaddr, blkaddr); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); } else if (need_writecp) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1eb343768781..78e6d0696847 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -197,28 +197,20 @@ void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; - int err; - SetPagePrivate(page); f2fs_trace_pid(page); + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + SetPagePrivate(page); + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); /* add atomic page indices to the list */ new->page = page; INIT_LIST_HEAD(&new->list); -retry: + /* increase reference count with clean state */ mutex_lock(&fi->inmem_lock); - err = radix_tree_insert(&fi->inmem_root, page->index, new); - if (err == -EEXIST) { - mutex_unlock(&fi->inmem_lock); - kmem_cache_free(inmem_entry_slab, new); - return; - } else if (err) { - mutex_unlock(&fi->inmem_lock); - goto retry; - } get_page(page); list_add_tail(&new->list, &fi->inmem_pages); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); @@ -227,7 +219,7 @@ retry: trace_f2fs_register_inmem_page(page, INMEM); } -void commit_inmem_pages(struct inode *inode, bool abort) +int commit_inmem_pages(struct inode *inode, bool abort) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -239,6 +231,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) .rw = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; + int err = 0; /* * The abort is true only when f2fs_evict_inode is called. @@ -254,23 +247,29 @@ void commit_inmem_pages(struct inode *inode, bool abort) mutex_lock(&fi->inmem_lock); list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + lock_page(cur->page); if (!abort) { - lock_page(cur->page); if (cur->page->mapping == inode->i_mapping) { + set_page_dirty(cur->page); f2fs_wait_on_page_writeback(cur->page, DATA); if (clear_page_dirty_for_io(cur->page)) inode_dec_dirty_pages(inode); trace_f2fs_commit_inmem_page(cur->page, INMEM); fio.page = cur->page; - do_write_data_page(&fio); + err = do_write_data_page(&fio); submit_bio = true; + if (err) { + unlock_page(cur->page); + break; + } } - f2fs_put_page(cur->page, 1); } else { trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); - put_page(cur->page); } - radix_tree_delete(&fi->inmem_root, cur->page->index); + set_page_private(cur->page, 0); + ClearPagePrivate(cur->page); + f2fs_put_page(cur->page, 1); + list_del(&cur->list); kmem_cache_free(inmem_entry_slab, cur); dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); @@ -282,6 +281,7 @@ void commit_inmem_pages(struct inode *inode, bool abort) if (submit_bio) f2fs_submit_merged_bio(sbi, DATA, WRITE); } + return err; } /* @@ -303,10 +303,18 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { /* try to shrink extent cache when there is no enough memory */ - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!available_free_memory(sbi, EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + + /* check the # of cached NAT entries */ + if (!available_free_memory(sbi, NAT_ENTRIES)) + try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + + if (!available_free_memory(sbi, FREE_NIDS)) + try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); - /* check the # of cached NAT entries and prefree segments */ - if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || + /* checkpoint is the only way to shrink partial cached entries */ + if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES)) f2fs_sync_fs(sbi->sb, true); @@ -322,10 +330,12 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct bio *bio; struct flush_cmd *cmd, *next; int ret; + bio = f2fs_bio_alloc(0); + fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); @@ -357,8 +367,15 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + if (!test_opt(sbi, FLUSH_MERGE)) { + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; + } init_completion(&cmd.wait); @@ -502,7 +519,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) +bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { int err = -ENOTSUPP; @@ -512,13 +529,16 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (f2fs_test_bit(offset, se->discard_map)) - return; + return false; err = f2fs_issue_discard(sbi, blkaddr, 1); } - if (err) + if (err) { update_meta_page(sbi, NULL, blkaddr); + return true; + } + return false; } static void __add_discard_entry(struct f2fs_sb_info *sbi, @@ -1217,7 +1237,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_lock(&sit_i->sentry_lock); /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff) + if (direct_io && curseg->next_blkoff && + !has_not_enough_free_secs(sbi, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1732,7 +1753,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = - f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC); + f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 79e7b879a753..b6e4ed15c698 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -177,6 +177,15 @@ struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; +/* + * this value is set in page as a private data which indicate that + * the page is atomically written, and it is in inmem_pages list. + */ +#define ATOMIC_WRITTEN_PAGE 0x0000ffff + +#define IS_ATOMIC_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) + struct inmem_pages { struct list_head list; struct page *page; @@ -555,16 +564,15 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) return curseg->next_blkoff; } -#ifdef CONFIG_F2FS_CHECK_FS static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { - BUG_ON(segno > TOTAL_SEGS(sbi) - 1); + f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); } static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - BUG_ON(blk_addr < SEG0_BLKADDR(sbi)); - BUG_ON(blk_addr >= MAX_BLKADDR(sbi)); + f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -573,16 +581,11 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) static inline void check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { +#ifdef CONFIG_F2FS_CHECK_FS bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; - /* check segment usage */ - BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); - - /* check boundary of a given segment number */ - BUG_ON(segno > TOTAL_SEGS(sbi) - 1); - /* check bitmap with valid block count */ do { if (is_valid) { @@ -598,35 +601,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, is_valid = !is_valid; } while (cur_pos < sbi->blocks_per_seg); BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); -} -#else -static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) -{ - if (segno > TOTAL_SEGS(sbi) - 1) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} - -static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) -{ - if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} - -/* - * Summary block is always treated as an invalid block - */ -static inline void check_block_count(struct f2fs_sb_info *sbi, - int segno, struct f2fs_sit_entry *raw_sit) -{ - /* check segment usage */ - if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) - set_sbi_flag(sbi, SBI_NEED_FSCK); - - /* check boundary of a given segment number */ - if (segno > TOTAL_SEGS(sbi) - 1) - set_sbi_flag(sbi, SBI_NEED_FSCK); -} #endif + /* check segment usage, and check boundary of a given segment number */ + f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + || segno > TOTAL_SEGS(sbi) - 1); +} static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c new file mode 100644 index 000000000000..da0d8e0b55a5 --- /dev/null +++ b/fs/f2fs/shrinker.c @@ -0,0 +1,139 @@ +/* + * f2fs shrinker support + * the basic infra was copied from fs/ubifs/shrinker.c + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" + +static LIST_HEAD(f2fs_list); +static DEFINE_SPINLOCK(f2fs_list_lock); +static unsigned int shrinker_run_no; + +static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; +} + +static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) +{ + if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) + return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + return 0; +} + +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +{ + return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); +} + +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned long count = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + /* count extent cache entries */ + count += __count_extent_cache(sbi); + + /* shrink clean nat cache entries */ + count += __count_nat_entries(sbi); + + /* count free nids cache entries */ + count += __count_free_nids(sbi); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + return count; +} + +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&f2fs_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + if (sbi->shrinker_run_no == run_no) + break; + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + sbi->shrinker_run_no = run_no; + + /* shrink extent cache entries */ + freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + + /* shrink clean nat cache entries */ + if (freed < nr) + freed += try_to_free_nats(sbi, nr - freed); + + /* shrink free nids cache entries */ + if (freed < nr) + freed += try_to_free_nids(sbi, nr - freed); + + spin_lock(&f2fs_list_lock); + p = p->next; + list_move_tail(&sbi->s_list, &f2fs_list); + mutex_unlock(&sbi->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&f2fs_list_lock); + return freed; +} + +void f2fs_join_shrinker(struct f2fs_sb_info *sbi) +{ + spin_lock(&f2fs_list_lock); + list_add_tail(&sbi->s_list, &f2fs_list); + spin_unlock(&f2fs_list_lock); +} + +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) +{ + f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + + spin_lock(&f2fs_list_lock); + list_del(&sbi->s_list); + spin_unlock(&f2fs_list_lock); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a06b0b46fe69..f79478115d37 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,13 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +/* f2fs-wide shrinker description */ +static struct shrinker f2fs_shrinker_info = { + .scan_objects = f2fs_shrink_scan, + .count_objects = f2fs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + enum { Opt_gc_background, Opt_disable_roll_forward, @@ -58,6 +65,7 @@ enum { Opt_nobarrier, Opt_fastboot, Opt_extent_cache, + Opt_noextent_cache, Opt_noinline_data, Opt_err, }; @@ -81,6 +89,7 @@ static match_table_t f2fs_tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, + {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_err, NULL}, }; @@ -382,6 +391,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_extent_cache: set_opt(sbi, EXTENT_CACHE); break; + case Opt_noextent_cache: + clear_opt(sbi, EXTENT_CACHE); + break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; @@ -410,9 +422,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; - rwlock_init(&fi->ext_lock); init_rwsem(&fi->i_sem); - INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); @@ -441,17 +451,22 @@ static int f2fs_drop_inode(struct inode *inode) */ if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { if (!inode->i_nlink && !is_bad_inode(inode)) { + /* to avoid evict_inode call simultaneously */ + atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) commit_inmem_pages(inode, true); + /* should remain fi->extent_tree for writepage */ + f2fs_destroy_extent_node(inode); + sb_start_intwrite(inode->i_sb); i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + f2fs_truncate(inode, true); sb_end_intwrite(inode->i_sb); @@ -461,6 +476,7 @@ static int f2fs_drop_inode(struct inode *inode) F2FS_I(inode)->i_crypt_info); #endif spin_lock(&inode->i_lock); + atomic_dec(&inode->i_count); } return 0; } @@ -498,9 +514,11 @@ static void f2fs_put_super(struct super_block *sb) } kobject_del(&sbi->s_kobj); - f2fs_destroy_stats(sbi); stop_gc_thread(sbi); + /* prevent remaining shrinker jobs */ + mutex_lock(&sbi->umount_mutex); + /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do @@ -514,6 +532,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* write_checkpoint can update stat informaion */ + f2fs_destroy_stats(sbi); + /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. @@ -521,6 +542,9 @@ static void f2fs_put_super(struct super_block *sb) release_dirty_inode(sbi); release_discard_addrs(sbi); + f2fs_leave_shrinker(sbi); + mutex_unlock(&sbi->umount_mutex); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -647,6 +671,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",fastboot"); if (test_opt(sbi, EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); + else + seq_puts(seq, ",noextent_cache"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -667,7 +693,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) struct seg_entry *se = get_seg_entry(sbi, i); if ((i % 10) == 0) - seq_printf(seq, "%-5d", i); + seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, get_valid_blocks(sbi, i, 1)); if ((i % 10) == 9 || i == (total_segs - 1)) @@ -699,6 +725,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, EXTENT_CACHE); #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -970,6 +997,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dir_level = DEF_DIR_LEVEL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + + INIT_LIST_HEAD(&sbi->s_list); + mutex_init(&sbi->umount_mutex); } /* @@ -1135,7 +1165,9 @@ try_onemore: mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); - clear_sbi_flag(sbi, SBI_POR_DOING); + + /* disallow all the data/node/meta page writes */ + set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); init_rwsem(&sbi->read_io.io_rwsem); @@ -1212,8 +1244,12 @@ try_onemore: goto free_nm; } + f2fs_join_shrinker(sbi); + /* if there are nt orphan nodes free them */ - recover_orphan_inodes(sbi); + err = recover_orphan_inodes(sbi); + if (err) + goto free_node_inode; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -1275,6 +1311,8 @@ try_onemore: goto free_kobj; } } + /* recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); /* * If filesystem is not mounted as read-only then @@ -1308,7 +1346,10 @@ free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + mutex_lock(&sbi->umount_mutex); + f2fs_leave_shrinker(sbi); iput(sbi->node_inode); + mutex_unlock(&sbi->umount_mutex); free_nm: destroy_node_manager(sbi); free_sm: @@ -1404,13 +1445,20 @@ static int __init init_f2fs_fs(void) err = f2fs_init_crypto(); if (err) goto free_kset; - err = register_filesystem(&f2fs_fs_type); + + err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_crypto; + + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_shrinker; f2fs_create_root_stats(); f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_shrinker: + unregister_shrinker(&f2fs_shrinker_info); free_crypto: f2fs_exit_crypto(); free_kset: @@ -1433,6 +1481,7 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); + unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); f2fs_exit_crypto(); destroy_extent_cache(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 07449b980acb..4de2286c0e4d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -499,9 +499,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, len = strlen(name); - if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode)) + if (len > F2FS_NAME_LEN) return -ERANGE; + if (size > MAX_VALUE_LEN(inode)) + return -E2BIG; + base_addr = read_all_xattrs(inode, ipage); if (!base_addr) goto exit; diff --git a/fs/file.c b/fs/file.c index 93c5f89c248b..6c672ad329e9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); + + /* make sure all __fd_install() have seen resize_in_progress + * or have finished their rcu_read_lock_sched() section. + */ + if (atomic_read(&files->count) > 1) + synchronize_sched(); + spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) __free_fdtable(new_fdt); return -EMFILE; } - /* - * Check again since another task may have expanded the fd table while - * we dropped the lock - */ cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds) { - /* Continue as planned */ - copy_fdtable(new_fdt, cur_fdt); - rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt != &files->fdtab) - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - } else { - /* Somebody else expanded, so undo our attempt */ - __free_fdtable(new_fdt); - } + BUG_ON(nr < cur_fdt->max_fds); + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt != &files->fdtab) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + /* coupled with smp_rmb() in __fd_install() */ + smp_wmb(); return 1; } @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) { struct fdtable *fdt; + int expanded = 0; +repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return 0; + return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; + if (unlikely(files->resize_in_progress)) { + spin_unlock(&files->file_lock); + expanded = 1; + wait_event(files->resize_wait, !files->resize_in_progress); + spin_lock(&files->file_lock); + goto repeat; + } + /* All good, so we try */ - return expand_fdtable(files, nr); + files->resize_in_progress = true; + expanded = expand_fdtable(files, nr); + files->resize_in_progress = false; + + wake_up_all(&files->resize_wait); + return expanded; } static inline void __set_close_on_exec(int fd, struct fdtable *fdt) @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->resize_in_progress = false; + init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; @@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); + + might_sleep(); + rcu_read_lock_sched(); + + while (unlikely(files->resize_in_progress)) { + rcu_read_unlock_sched(); + wait_event(files->resize_wait, !files->resize_in_progress); + rcu_read_lock_sched(); + } + /* coupled with smp_wmb() in expand_fdtable() */ + smp_rmb(); + fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) @@ -635,11 +664,17 @@ static struct file *__fget(unsigned int fd, fmode_t mask) struct file *file; rcu_read_lock(); +loop: file = fcheck_files(files, fd); if (file) { - /* File object ref couldn't be taken */ - if ((file->f_mode & mask) || !get_file_rcu(file)) + /* File object ref couldn't be taken. + * dup2() atomicity guarantee is the reason + * we loop to catch the new file (or NULL pointer) + */ + if (file->f_mode & mask) file = NULL; + else if (!get_file_rcu(file)) + goto loop; } rcu_read_unlock(); diff --git a/fs/file_table.c b/fs/file_table.c index 294174dcc226..ad17e05ebf95 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -20,12 +20,12 @@ #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> -#include <linux/lglock.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/task_work.h> #include <linux/ima.h> +#include <linux/swap.h> #include <linux/atomic.h> @@ -309,19 +309,24 @@ void put_filp(struct file *file) } } -void __init files_init(unsigned long mempages) +void __init files_init(void) { - unsigned long n; - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); +} - /* - * One file with associated inode and dcache is very roughly 1K. - * Per default don't use more than 10% of our memory for files. - */ +/* + * One file with associated inode and dcache is very roughly 1K. Per default + * do not use more than 10% of our memory for files. + */ +void __init files_maxfiles_init(void) +{ + unsigned long n; + unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + + memreserve = min(memreserve, totalram_pages - 1); + n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; - n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 99c7f0a37af4..1cff72df0389 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = { .iterate = vxfs_readdir, }; - -static inline u_long -dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static inline u_long dir_blocks(struct inode *ip) { @@ -199,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * by @dp in @dip. * * Returns: - * A NULL-pointer on success, else an negative error code encoded + * A NULL-pointer on success, else a negative error code encoded * in the return pointer. */ static struct dentry * diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f0520bcf2094..24489126f8ca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,8 +53,6 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ - unsigned int single_wait:1; - unsigned int single_done:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -88,7 +86,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { - return list_entry(head, struct inode, i_wb_list); + return list_entry(head, struct inode, i_io_list); } /* @@ -125,22 +123,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) } /** - * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io} * - * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. */ -static bool inode_wb_list_move_locked(struct inode *inode, +static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, head); + list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) @@ -151,19 +149,19 @@ static bool inode_wb_list_move_locked(struct inode *inode, } /** - * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list * @inode: inode to be removed * @wb: bdi_writeback @inode is being removed from * * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and * clear %WB_has_dirty_io if all are empty afterwards. */ -static void inode_wb_list_del_locked(struct inode *inode, +static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); - list_del_init(&inode->i_wb_list); + list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb) static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { - trace_writeback_queue(wb->bdi, work); + trace_writeback_queue(wb, work); spin_lock_bh(&wb->work_lock); - if (!test_bit(WB_registered, &wb->state)) { - if (work->single_wait) - work->single_done = 1; + if (!test_bit(WB_registered, &wb->state)) goto out_unlock; - } if (work->done) atomic_inc(&work->done->cnt); list_add_tail(&work->list, &wb->work_list); @@ -351,7 +346,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_wb_list. + * the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & I_FREEING)) goto skip_switch; @@ -390,16 +385,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * is always correct including from ->b_dirty_time. The transfer * preserves @inode->dirtied_when ordering. */ - if (!list_empty(&inode->i_wb_list)) { + if (!list_empty(&inode->i_io_list)) { struct inode *pos; - inode_wb_list_del_locked(inode, old_wb); + inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; - inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode->i_wb = new_wb; } @@ -702,10 +697,11 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } +EXPORT_SYMBOL_GPL(wbc_account_io); /** * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion + * @inode: inode to test for congestion (may be NULL) * @cong_bits: mask of WB_[a]sync_congested bits to test * * Tests whether @inode is congested. @cong_bits is the mask of congestion @@ -715,6 +711,9 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg * associated with @inode is congested; otherwise, the root wb's congestion * state is used. + * + * @inode is allowed to be NULL as this function is often called on + * mapping->host which is NULL for the swapper space. */ int inode_congested(struct inode *inode, int cong_bits) { @@ -737,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits) EXPORT_SYMBOL_GPL(inode_congested); /** - * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work - * @bdi: bdi the work item was issued to - * @work: work item to wait for - * - * Wait for the completion of @work which was issued to one of @bdi's - * bdi_writeback's. The caller must have set @work->single_wait before - * issuing it. This wait operates independently fo - * wb_wait_for_completion() and also disables automatic freeing of @work. - */ -static void wb_wait_for_single_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) -{ - if (WARN_ON_ONCE(!work->single_wait)) - return; - - wait_event(bdi->wb_waitq, work->single_done); - - /* - * Paired with smp_wmb() in wb_do_writeback() and ensures that all - * modifications to @work prior to assertion of ->single_done is - * visible to the caller once this function returns. - */ - smp_rmb(); -} - -/** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi @@ -791,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) } /** - * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb - * @wb: target bdi_writeback - * @base_work: source wb_writeback_work - * - * Try to make a clone of @base_work and issue it to @wb. If cloning - * succeeds, %true is returned; otherwise, @base_work is issued directly - * and %false is returned. In the latter case, the caller is required to - * wait for @base_work's completion using wb_wait_for_single_work(). - * - * A clone is auto-freed on completion. @base_work never is. - */ -static bool wb_clone_and_queue_work(struct bdi_writeback *wb, - struct wb_writeback_work *base_work) -{ - struct wb_writeback_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - *work = *base_work; - work->auto_free = 1; - work->single_wait = 0; - } else { - work = base_work; - work->auto_free = 0; - work->single_wait = 1; - } - work->single_done = 0; - wb_queue_work(wb, work); - return work != base_work; -} - -/** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info * @base_work: wb_writeback_work to issue @@ -837,29 +778,51 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { - long nr_pages = base_work->nr_pages; - int next_blkcg_id = 0; + int next_memcg_id = 0; struct bdi_writeback *wb; struct wb_iter iter; might_sleep(); - - if (!bdi_has_dirty_io(bdi)) - return; restart: rcu_read_lock(); - bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { - if (!wb_has_dirty_io(wb) || - (skip_if_busy && writeback_in_progress(wb))) + bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { + DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); + struct wb_writeback_work fallback_work; + struct wb_writeback_work *work; + long nr_pages; + + /* SYNC_ALL writes out I_DIRTY_TIME too */ + if (!wb_has_dirty_io(wb) && + (base_work->sync_mode == WB_SYNC_NONE || + list_empty(&wb->b_dirty_time))) + continue; + if (skip_if_busy && writeback_in_progress(wb)) continue; - base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); - if (!wb_clone_and_queue_work(wb, base_work)) { - next_blkcg_id = wb->blkcg_css->id + 1; - rcu_read_unlock(); - wb_wait_for_single_work(bdi, base_work); - goto restart; + nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + *work = *base_work; + work->nr_pages = nr_pages; + work->auto_free = 1; + wb_queue_work(wb, work); + continue; } + + /* alloc failed, execute synchronously using on-stack fallback */ + work = &fallback_work; + *work = *base_work; + work->nr_pages = nr_pages; + work->auto_free = 0; + work->done = &fallback_work_done; + + wb_queue_work(wb, work); + + next_memcg_id = wb->memcg_css->id + 1; + rcu_read_unlock(); + wb_wait_for_completion(bdi, &fallback_work_done); + goto restart; } rcu_read_unlock(); } @@ -898,11 +861,8 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, { might_sleep(); - if (bdi_has_dirty_io(bdi) && - (!skip_if_busy || !writeback_in_progress(&bdi->wb))) { + if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; - base_work->single_wait = 0; - base_work->single_done = 0; wb_queue_work(&bdi->wb, base_work); } } @@ -923,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(wb->bdi); + trace_writeback_nowork(wb); wb_wakeup(wb); return; } @@ -953,19 +913,19 @@ void wb_start_background_writeback(struct bdi_writeback *wb) * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ - trace_writeback_wake_background(wb->bdi); + trace_writeback_wake_background(wb); wb_wakeup(wb); } /* * Remove the inode from the writeback list it is on. */ -void inode_wb_list_del(struct inode *inode) +void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } @@ -987,7 +947,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - inode_wb_list_move_locked(inode, wb, &wb->b_dirty); + inode_io_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -995,7 +955,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - inode_wb_list_move_locked(inode, wb, &wb->b_more_io); + inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -1054,7 +1014,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; - list_move(&inode->i_wb_list, &tmp); + list_move(&inode->i_io_list, &tmp); moved++; if (flags & EXPIRE_DIRTY_ATIME) set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); @@ -1077,7 +1037,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) - list_move(&inode->i_wb_list, dispatch_queue); + list_move(&inode->i_io_list, dispatch_queue); } } out: @@ -1231,10 +1191,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); + inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); } } @@ -1377,7 +1337,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -1438,7 +1398,9 @@ static long writeback_sb_inodes(struct super_block *sb, unsigned long start_time = jiffies; long write_chunk; long wrote = 0; /* count both pages and inodes */ + struct blk_plug plug; + blk_start_plug(&plug); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -1536,6 +1498,7 @@ static long writeback_sb_inodes(struct super_block *sb, break; } } + blk_finish_plug(&plug); return wrote; } @@ -1656,14 +1619,14 @@ static long wb_writeback(struct bdi_writeback *wb, } else if (work->for_background) oldest_jif = jiffies; - trace_writeback_start(wb->bdi, work); + trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); - trace_writeback_written(wb->bdi, work); + trace_writeback_written(wb, work); wb_update_bandwidth(wb, wb_start); @@ -1688,7 +1651,7 @@ static long wb_writeback(struct bdi_writeback *wb, * we'll just busyloop. */ if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb->bdi, work); + trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); @@ -1793,26 +1756,14 @@ static long wb_do_writeback(struct bdi_writeback *wb) set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { struct wb_completion *done = work->done; - bool need_wake_up = false; - trace_writeback_exec(wb->bdi, work); + trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); - if (work->single_wait) { - WARN_ON_ONCE(work->auto_free); - /* paired w/ rmb in wb_wait_for_single_work() */ - smp_wmb(); - work->single_done = 1; - need_wake_up = true; - } else if (work->auto_free) { + if (work->auto_free) kfree(work); - } - if (done && atomic_dec_and_test(&done->cnt)) - need_wake_up = true; - - if (need_wake_up) wake_up_all(&wb->bdi->wb_waitq); } @@ -2087,7 +2038,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) else dirty_list = &wb->b_dirty_time; - wakeup_bdi = inode_wb_list_move_locked(inode, wb, + wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); spin_unlock(&wb->list_lock); @@ -2110,6 +2061,15 @@ out_unlock_inode: } EXPORT_SYMBOL(__mark_inode_dirty); +/* + * The @s_sync_lock is used to serialise concurrent sync operations + * to avoid lock contention problems with concurrent wait_sb_inodes() calls. + * Concurrent callers will block on the s_sync_lock rather than doing contending + * walks. The queueing maintains sync(2) required behaviour as all the IO that + * has been issued up to the time this function is enter is guaranteed to be + * completed by the time we have gained the lock and waited for all IO that is + * in progress regardless of the order callers are granted the lock. + */ static void wait_sb_inodes(struct super_block *sb) { struct inode *inode, *old_inode = NULL; @@ -2120,7 +2080,8 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_sb_list_lock); + mutex_lock(&sb->s_sync_lock); + spin_lock(&sb->s_inode_list_lock); /* * Data integrity sync. Must wait for all pages under writeback, @@ -2140,14 +2101,14 @@ static void wait_sb_inodes(struct super_block *sb) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock. We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -2157,10 +2118,11 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); + mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, @@ -2274,8 +2236,12 @@ void sync_inodes_sb(struct super_block *sb) }; struct backing_dev_info *bdi = sb->s_bdi; - /* Nothing to do? */ - if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) + /* + * Can't skip on !bdi_has_dirty() because we should wait for !dirty + * inodes under writeback and I_DIRTY_TIME inodes ignored by + * bdi_has_dirty() need to be written out too. + */ + if (bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 89acec742e0b..d403c69bee08 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache, object_already_extant: ret = -ENOBUFS; - if (fscache_object_is_dead(object)) { + if (fscache_object_is_dying(object) || + fscache_cache_is_broken(object)) { spin_unlock(&cookie->lock); goto error; } @@ -671,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) if (!op) return -ENOMEM; - fscache_operation_init(op, NULL, NULL); + fscache_operation_init(op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -695,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) /* the work queue now carries its own ref on the object */ spin_unlock(&cookie->lock); - ret = fscache_wait_for_operation_activation(object, op, - NULL, NULL, NULL); + ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); if (ret == 0) { /* ask the cache to honour the operation */ ret = object->cache->ops->check_consistency(op); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 7872a62ef30c..97ec45110957 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -124,8 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *, - void (*)(struct fscache_operation *)); +extern int fscache_cancel_op(struct fscache_operation *, bool); extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); @@ -138,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); extern int fscache_wait_for_operation_activation(struct fscache_object *, struct fscache_operation *, atomic_t *, - atomic_t *, - void (*)(struct fscache_operation *)); + atomic_t *); extern void fscache_invalidate_writes(struct fscache_cookie *); /* @@ -164,6 +162,7 @@ extern atomic_t fscache_n_op_pend; extern atomic_t fscache_n_op_run; extern atomic_t fscache_n_op_enqueue; extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_initialised; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; extern atomic_t fscache_n_op_cancelled; @@ -271,6 +270,11 @@ extern atomic_t fscache_n_cop_write_page; extern atomic_t fscache_n_cop_uncache_page; extern atomic_t fscache_n_cop_dissociate_pages; +extern atomic_t fscache_n_cache_no_space_reject; +extern atomic_t fscache_n_cache_stale_objects; +extern atomic_t fscache_n_cache_retired_objects; +extern atomic_t fscache_n_cache_culled_objects; + static inline void fscache_stat(atomic_t *stat) { atomic_inc(stat); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index da032daf0e0d..9e792e30f4db 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -328,6 +328,17 @@ void fscache_object_init(struct fscache_object *object, EXPORT_SYMBOL(fscache_object_init); /* + * Mark the object as no longer being live, making sure that we synchronise + * against op submission. + */ +static inline void fscache_mark_object_dead(struct fscache_object *object) +{ + spin_lock(&object->lock); + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + spin_unlock(&object->lock); +} + +/* * Abort object initialisation before we start it. */ static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, @@ -610,6 +621,8 @@ static const struct fscache_state *fscache_lookup_failure(struct fscache_object object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); + set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags); + cookie = object->cookie; set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) @@ -629,7 +642,7 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob _enter("{OBJ%x,%d,%d},%d", object->debug_id, object->n_ops, object->n_children, event); - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); object->oob_event_mask = 0; if (list_empty(&object->dependents) && @@ -948,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj if (!op) goto nomem; - fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); + fscache_operation_init(op, object->cache->ops->invalidate_object, + NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -974,13 +988,13 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj return transit_to(UPDATE_OBJECT); nomem: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); fscache_unuse_cookie(object); _leave(" [ENOMEM]"); return transit_to(KILL_OBJECT); submit_op_failed: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); spin_unlock(&cookie->lock); fscache_unuse_cookie(object); kfree(op); @@ -1016,3 +1030,50 @@ static const struct fscache_state *fscache_update_object(struct fscache_object * _leave(""); return transit_to(WAIT_FOR_CMD); } + +/** + * fscache_object_retrying_stale - Note retrying stale object + * @object: The object that will be retried + * + * Note that an object lookup found an on-disk object that was adjudged to be + * stale and has been deleted. The lookup will be retried. + */ +void fscache_object_retrying_stale(struct fscache_object *object) +{ + fscache_stat(&fscache_n_cache_no_space_reject); +} +EXPORT_SYMBOL(fscache_object_retrying_stale); + +/** + * fscache_object_mark_killed - Note that an object was killed + * @object: The object that was culled + * @why: The reason the object was killed. + * + * Note that an object was killed. Returns true if the object was + * already marked killed, false if it wasn't. + */ +void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why) +{ + if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) { + pr_err("Error: Object already killed by cache [%s]\n", + object->cache->identifier); + return; + } + + switch (why) { + case FSCACHE_OBJECT_NO_SPACE: + fscache_stat(&fscache_n_cache_no_space_reject); + break; + case FSCACHE_OBJECT_IS_STALE: + fscache_stat(&fscache_n_cache_stale_objects); + break; + case FSCACHE_OBJECT_WAS_RETIRED: + fscache_stat(&fscache_n_cache_retired_objects); + break; + case FSCACHE_OBJECT_WAS_CULLED: + fscache_stat(&fscache_n_cache_culled_objects); + break; + } +} +EXPORT_SYMBOL(fscache_object_mark_killed); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e7b87a0e5185..de67745e1cd7 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -20,6 +20,35 @@ atomic_t fscache_op_debug_id; EXPORT_SYMBOL(fscache_op_debug_id); +static void fscache_operation_dummy_cancel(struct fscache_operation *op) +{ +} + +/** + * fscache_operation_init - Do basic initialisation of an operation + * @op: The operation to initialise + * @release: The release function to assign + * + * Do basic initialisation of an operation. The caller must still set flags, + * object and processor if needed. + */ +void fscache_operation_init(struct fscache_operation *op, + fscache_operation_processor_t processor, + fscache_operation_cancel_t cancel, + fscache_operation_release_t release) +{ + INIT_WORK(&op->work, fscache_op_work_func); + atomic_set(&op->usage, 1); + op->state = FSCACHE_OP_ST_INITIALISED; + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->processor = processor; + op->cancel = cancel ?: fscache_operation_dummy_cancel; + op->release = release; + INIT_LIST_HEAD(&op->pend_link); + fscache_stat(&fscache_n_op_initialised); +} +EXPORT_SYMBOL(fscache_operation_init); + /** * fscache_enqueue_operation - Enqueue an operation for processing * @op: The operation to enqueue @@ -76,6 +105,43 @@ static void fscache_run_op(struct fscache_object *object, } /* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + const struct fscache_state *ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) + return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, object->state->name); + kdebug("objstate=%s [%s]", object->state->name, ostate->name); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* * submit an exclusive operation for an object * - other ops are excluded from running simultaneously with this one * - this gets any extra refs it needs on an op @@ -83,6 +149,8 @@ static void fscache_run_op(struct fscache_object *object, int fscache_submit_exclusive_op(struct fscache_object *object, struct fscache_operation *op) { + const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); @@ -95,8 +163,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object, ASSERTCMP(object->n_ops, >=, object->n_exclusive); ASSERT(list_empty(&op->pend_link)); + ostate = object->state; + smp_rmb(); + op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -118,7 +199,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -126,12 +207,15 @@ int fscache_submit_exclusive_op(struct fscache_object *object, list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } else { - /* If we're in any other state, there must have been an I/O - * error of some nature. - */ - ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); - ret = -EIO; + fscache_report_unexpected_submission(object, op, ostate); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } spin_unlock(&object->lock); @@ -139,43 +223,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object, } /* - * report an unexpected submission - */ -static void fscache_report_unexpected_submission(struct fscache_object *object, - struct fscache_operation *op, - const struct fscache_state *ostate) -{ - static bool once_only; - struct fscache_operation *p; - unsigned n; - - if (once_only) - return; - once_only = true; - - kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, object->state->name); - kdebug("objstate=%s [%s]", object->state->name, ostate->name); - kdebug("objflags=%lx", object->flags); - kdebug("objevent=%lx [%lx]", object->events, object->event_mask); - kdebug("ops=%u inp=%u exc=%u", - object->n_ops, object->n_in_progress, object->n_exclusive); - - if (!list_empty(&object->pending_ops)) { - n = 0; - list_for_each_entry(p, &object->pending_ops, pend_link) { - ASSERTCMP(p->object, ==, object); - kdebug("%p %p", op->processor, op->release); - n++; - } - - kdebug("n=%u", n); - } - - dump_stack(); -} - -/* * submit an operation for an object * - objects may be submitted only in the following states: * - during object creation (write ops may be submitted) @@ -187,6 +234,7 @@ int fscache_submit_op(struct fscache_object *object, struct fscache_operation *op) { const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},{%u}", @@ -204,7 +252,17 @@ int fscache_submit_op(struct fscache_object *object, smp_rmb(); op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; @@ -222,23 +280,21 @@ int fscache_submit_op(struct fscache_object *object, fscache_run_op(object, op); } ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; - } else if (fscache_object_is_dying(object)) { - fscache_stat(&fscache_n_op_rejected); + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; - } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -293,9 +349,10 @@ void fscache_start_operations(struct fscache_object *object) * cancel an operation that's pending on an object */ int fscache_cancel_op(struct fscache_operation *op, - void (*do_cancel)(struct fscache_operation *)) + bool cancel_in_progress_op) { struct fscache_object *object = op->object; + bool put = false; int ret; _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); @@ -309,19 +366,37 @@ int fscache_cancel_op(struct fscache_operation *op, ret = -EBUSY; if (op->state == FSCACHE_OP_ST_PENDING) { ASSERT(!list_empty(&op->pend_link)); - fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); - if (do_cancel) - do_cancel(op); + put = true; + + fscache_stat(&fscache_n_op_cancelled); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + ret = 0; + } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) { + ASSERTCMP(object->n_in_progress, >, 0); + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + fscache_stat(&fscache_n_op_cancelled); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - fscache_put_operation(op); ret = 0; } + if (put) + fscache_put_operation(op); spin_unlock(&object->lock); _leave(" = %d", ret); return ret; @@ -345,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object) list_del_init(&op->pend_link); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) @@ -377,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled) spin_lock(&object->lock); - op->state = cancelled ? - FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; + if (!cancelled) { + op->state = FSCACHE_OP_ST_COMPLETE; + } else { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + } if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; @@ -409,9 +489,9 @@ void fscache_put_operation(struct fscache_operation *op) return; _debug("PUT OP"); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE, + ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && + op->state != FSCACHE_OP_ST_COMPLETE, op->state, ==, FSCACHE_OP_ST_CANCELLED); - op->state = FSCACHE_OP_ST_DEAD; fscache_stat(&fscache_n_op_release); @@ -419,37 +499,39 @@ void fscache_put_operation(struct fscache_operation *op) op->release(op); op->release = NULL; } + op->state = FSCACHE_OP_ST_DEAD; object = op->object; + if (likely(object)) { + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) + fscache_unuse_cookie(object); + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) - atomic_dec(&object->n_reads); - if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) - fscache_unuse_cookie(object); - - /* now... we may get called with the object spinlock held, so we - * complete the cleanup here only if we can immediately acquire the - * lock, and defer it otherwise */ - if (!spin_trylock(&object->lock)) { - _debug("defer put"); - fscache_stat(&fscache_n_op_deferred_release); + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - cache = object->cache; - spin_lock(&cache->op_gc_list_lock); - list_add_tail(&op->pend_link, &cache->op_gc_list); - spin_unlock(&cache->op_gc_list_lock); - schedule_work(&cache->op_gc); - _leave(" [defer]"); - return; + spin_unlock(&object->lock); } - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - kfree(op); _leave(" [done]"); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index de33b3fccca6..483bbc613bf0 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, fscache_attr_changed_op, NULL); + fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -239,7 +239,7 @@ nobufs_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_operation(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_attr_changed_nobufs); @@ -249,6 +249,17 @@ nobufs: EXPORT_SYMBOL(__fscache_attr_changed); /* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + atomic_set(&op->n_pages, 0); +} + +/* * release a retrieval op reference */ static void fscache_release_retrieval_op(struct fscache_operation *_op) @@ -258,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); - ASSERTCMP(atomic_read(&op->n_pages), ==, 0); + ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, + atomic_read(&op->n_pages), ==, 0); fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) - fscache_put_context(op->op.object->cookie, op->context); + fscache_put_context(op->cookie, op->context); _leave(""); } @@ -285,15 +297,24 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, + fscache_do_cancel_retrieval, + fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); + op->cookie = cookie; op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); + + /* Pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure. + */ + if (context) + fscache_get_context(op->cookie, context); return op; } @@ -330,24 +351,12 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) } /* - * Handle cancellation of a pending retrieval op - */ -static void fscache_do_cancel_retrieval(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - atomic_set(&op->n_pages, 0); -} - -/* * wait for an object to become active (or dead) */ int fscache_wait_for_operation_activation(struct fscache_object *object, struct fscache_operation *op, atomic_t *stat_op_waits, - atomic_t *stat_object_dead, - void (*do_cancel)(struct fscache_operation *)) + atomic_t *stat_object_dead) { int ret; @@ -359,7 +368,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(op, do_cancel); + ret = fscache_cancel_op(op, false); if (ret == 0) return -ERESTARTSYS; @@ -377,11 +386,13 @@ check_if_dead: _leave(" = -ENOBUFS [cancelled]"); return -ENOBUFS; } - if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); - fscache_cancel_op(op, do_cancel); + if (unlikely(fscache_object_is_dying(object) || + fscache_cache_is_broken(object))) { + enum fscache_operation_state state = op->state; + fscache_cancel_op(op, true); if (stat_object_dead) fscache_stat(stat_object_dead); + _leave(" = -ENOBUFS [obj dead %d]", state); return -ENOBUFS; } return 0; @@ -453,17 +464,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -503,7 +509,7 @@ nobufs_unlock: spin_unlock(&cookie->lock); if (wake_cookie) __fscache_wake_unused_cookie(cookie); - kfree(op); + fscache_put_retrieval(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); _leave(" = -ENOBUFS"); @@ -584,17 +590,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -632,7 +633,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -700,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_allocs_object_dead)); if (ret < 0) goto error; @@ -726,7 +726,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -944,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_write_op, + fscache_operation_init(&op->op, fscache_write_op, NULL, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING) | @@ -1016,7 +1016,7 @@ already_pending: spin_unlock(&object->lock); spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); fscache_stat(&fscache_n_stores_ok); _leave(" = 0"); return 0; @@ -1036,7 +1036,7 @@ nobufs_unlock_obj: nobufs: spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_stores_nobufs); @@ -1044,7 +1044,7 @@ nobufs: return -ENOBUFS; nomem_free: - kfree(op); + fscache_put_operation(&op->op); nomem: fscache_stat(&fscache_n_stores_oom); _leave(" = -ENOMEM"); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 40d13c70ef51..7cfa0aacdf6d 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -23,6 +23,7 @@ atomic_t fscache_n_op_run; atomic_t fscache_n_op_enqueue; atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_initialised; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; atomic_t fscache_n_op_cancelled; @@ -130,6 +131,11 @@ atomic_t fscache_n_cop_write_page; atomic_t fscache_n_cop_uncache_page; atomic_t fscache_n_cop_dissociate_pages; +atomic_t fscache_n_cache_no_space_reject; +atomic_t fscache_n_cache_stale_objects; +atomic_t fscache_n_cache_retired_objects; +atomic_t fscache_n_cache_culled_objects; + /* * display the general statistics */ @@ -246,7 +252,8 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_op_enqueue), atomic_read(&fscache_n_op_cancelled), atomic_read(&fscache_n_op_rejected)); - seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_initialised), atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), atomic_read(&fscache_n_op_gc)); @@ -271,6 +278,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cop_write_page), atomic_read(&fscache_n_cop_uncache_page), atomic_read(&fscache_n_cop_dissociate_pages)); + seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n", + atomic_read(&fscache_n_cache_no_space_reject), + atomic_read(&fscache_n_cache_stale_objects), + atomic_read(&fscache_n_cache_retired_objects), + atomic_read(&fscache_n_cache_culled_objects)); return 0; } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e5bbf748b698..eae2c11268bc 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -489,6 +489,7 @@ static void cuse_fc_release(struct fuse_conn *fc) */ static int cuse_channel_open(struct inode *inode, struct file *file) { + struct fuse_dev *fud; struct cuse_conn *cc; int rc; @@ -499,17 +500,22 @@ static int cuse_channel_open(struct inode *inode, struct file *file) fuse_conn_init(&cc->fc); + fud = fuse_dev_alloc(&cc->fc); + if (!fud) { + kfree(cc); + return -ENOMEM; + } + INIT_LIST_HEAD(&cc->list); cc->fc.release = cuse_fc_release; - cc->fc.connected = 1; cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { - fuse_conn_put(&cc->fc); + fuse_dev_free(fud); return rc; } - file->private_data = &cc->fc; /* channel owns base reference to cc */ + file->private_data = fud; return 0; } @@ -527,7 +533,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct cuse_conn *cc = fc_to_cc(file->private_data); + struct fuse_dev *fud = file->private_data; + struct cuse_conn *cc = fc_to_cc(fud->fc); int rc; /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c8b68ab2e574..ebb5e37455a0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,13 +25,13 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; -static struct fuse_conn *fuse_get_conn(struct file *file) +static struct fuse_dev *fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return file->private_data; + return ACCESS_ONCE(file->private_data); } static void fuse_request_init(struct fuse_req *req, struct page **pages, @@ -48,6 +48,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; + __set_bit(FR_PENDING, &req->flags); } static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) @@ -168,6 +169,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (!fc->connected) goto out; + err = -ECONNREFUSED; + if (fc->conn_error) + goto out; + req = fuse_request_alloc(npages); err = -ENOMEM; if (!req) { @@ -177,8 +182,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, } fuse_req_init_context(req); - req->waiting = 1; - req->background = for_background; + __set_bit(FR_WAITING, &req->flags); + if (for_background) + __set_bit(FR_BACKGROUND, &req->flags); + return req; out: @@ -268,15 +275,15 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, req = get_reserved_req(fc, file); fuse_req_init_context(req); - req->waiting = 1; - req->background = 0; + __set_bit(FR_WAITING, &req->flags); + __clear_bit(FR_BACKGROUND, &req->flags); return req; } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - if (unlikely(req->background)) { + if (test_bit(FR_BACKGROUND, &req->flags)) { /* * We get here in the unlikely case that a background * request was allocated but not sent @@ -287,8 +294,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) spin_unlock(&fc->lock); } - if (req->waiting) + if (test_bit(FR_WAITING, &req->flags)) { + __clear_bit(FR_WAITING, &req->flags); atomic_dec(&fc->num_waiting); + } if (req->stolen_file) put_reserved_req(fc, req); @@ -309,46 +318,38 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } -static u64 fuse_get_unique(struct fuse_conn *fc) +static u64 fuse_get_unique(struct fuse_iqueue *fiq) { - fc->reqctr++; - /* zero is special */ - if (fc->reqctr == 0) - fc->reqctr = 1; - - return fc->reqctr; + return ++fiq->reqctr; } -static void queue_request(struct fuse_conn *fc, struct fuse_req *req) +static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - list_add_tail(&req->list, &fc->pending); - req->state = FUSE_REQ_PENDING; - if (!req->waiting) { - req->waiting = 1; - atomic_inc(&fc->num_waiting); - } - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + list_add_tail(&req->list, &fiq->pending); + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, u64 nodeid, u64 nlookup) { + struct fuse_iqueue *fiq = &fc->iq; + forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fc->lock); - if (fc->connected) { - fc->forget_list_tail->next = forget; - fc->forget_list_tail = forget; - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } else { kfree(forget); } - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -356,12 +357,15 @@ static void flush_bg_queue(struct fuse_conn *fc) while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; + struct fuse_iqueue *fiq = &fc->iq; req = list_entry(fc->bg_queue.next, struct fuse_req, list); list_del(&req->list); fc->active_background++; - req->in.h.unique = fuse_get_unique(fc); - queue_request(fc, req); + spin_lock(&fiq->waitq.lock); + req->in.h.unique = fuse_get_unique(fiq); + queue_request(fiq, req); + spin_unlock(&fiq->waitq.lock); } } @@ -372,20 +376,22 @@ static void flush_bg_queue(struct fuse_conn *fc) * was closed. The requester thread is woken up (if still waiting), * the 'end' callback is called if given, else the reference to the * request is released - * - * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) -__releases(fc->lock) { - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->end = NULL; - list_del(&req->list); - list_del(&req->intr_entry); - req->state = FUSE_REQ_FINISHED; - if (req->background) { - req->background = 0; + struct fuse_iqueue *fiq = &fc->iq; + + if (test_and_set_bit(FR_FINISHED, &req->flags)) + return; + spin_lock(&fiq->waitq.lock); + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + WARN_ON(test_bit(FR_PENDING, &req->flags)); + WARN_ON(test_bit(FR_SENT, &req->flags)); + if (test_bit(FR_BACKGROUND, &req->flags)) { + spin_lock(&fc->lock); + clear_bit(FR_BACKGROUND, &req->flags); if (fc->num_background == fc->max_background) fc->blocked = 0; @@ -401,122 +407,105 @@ __releases(fc->lock) fc->num_background--; fc->active_background--; flush_bg_queue(fc); + spin_unlock(&fc->lock); } - spin_unlock(&fc->lock); wake_up(&req->waitq); - if (end) - end(fc, req); + if (req->end) + req->end(fc, req); fuse_put_request(fc, req); } -static void wait_answer_interruptible(struct fuse_conn *fc, - struct fuse_req *req) -__releases(fc->lock) -__acquires(fc->lock) -{ - if (signal_pending(current)) - return; - - spin_unlock(&fc->lock); - wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); - spin_lock(&fc->lock); -} - -static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { - list_add_tail(&req->intr_entry, &fc->interrupts); - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_lock(&fiq->waitq.lock); + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + wake_up_locked(&fiq->waitq); + } + spin_unlock(&fiq->waitq.lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -__releases(fc->lock) -__acquires(fc->lock) { + struct fuse_iqueue *fiq = &fc->iq; + int err; + if (!fc->no_interrupt) { /* Any signal may interrupt this */ - wait_answer_interruptible(fc, req); - - if (req->aborted) - goto aborted; - if (req->state == FUSE_REQ_FINISHED) + err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); + if (!err) return; - req->interrupted = 1; - if (req->state == FUSE_REQ_SENT) - queue_interrupt(fc, req); + set_bit(FR_INTERRUPTED, &req->flags); + /* matches barrier in fuse_dev_do_read() */ + smp_mb__after_atomic(); + if (test_bit(FR_SENT, &req->flags)) + queue_interrupt(fiq, req); } - if (!req->force) { + if (!test_bit(FR_FORCE, &req->flags)) { sigset_t oldset; /* Only fatal signals may interrupt this */ block_sigs(&oldset); - wait_answer_interruptible(fc, req); + err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); restore_sigs(&oldset); - if (req->aborted) - goto aborted; - if (req->state == FUSE_REQ_FINISHED) + if (!err) return; + spin_lock(&fiq->waitq.lock); /* Request is not yet in userspace, bail out */ - if (req->state == FUSE_REQ_PENDING) { + if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); + spin_unlock(&fiq->waitq.lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } + spin_unlock(&fiq->waitq.lock); } /* * Either request is already in userspace, or it was forced. * Wait it out. */ - spin_unlock(&fc->lock); - wait_event(req->waitq, req->state == FUSE_REQ_FINISHED); - spin_lock(&fc->lock); - - if (!req->aborted) - return; - - aborted: - BUG_ON(req->state != FUSE_REQ_FINISHED); - if (req->locked) { - /* This is uninterruptible sleep, because data is - being copied to/from the buffers of req. During - locked state, there mustn't be any filesystem - operation (e.g. page fault), since that could lead - to deadlock */ - spin_unlock(&fc->lock); - wait_event(req->waitq, !req->locked); - spin_lock(&fc->lock); - } + wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(req->background); - spin_lock(&fc->lock); - if (!fc->connected) + struct fuse_iqueue *fiq = &fc->iq; + + BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); + spin_lock(&fiq->waitq.lock); + if (!fiq->connected) { + spin_unlock(&fiq->waitq.lock); req->out.h.error = -ENOTCONN; - else if (fc->conn_error) - req->out.h.error = -ECONNREFUSED; - else { - req->in.h.unique = fuse_get_unique(fc); - queue_request(fc, req); + } else { + req->in.h.unique = fuse_get_unique(fiq); + queue_request(fiq, req); /* acquire extra reference, since request is still needed after request_end() */ __fuse_get_request(req); + spin_unlock(&fiq->waitq.lock); request_wait_answer(fc, req); + /* Pairs with smp_wmb() in request_end() */ + smp_rmb(); } - spin_unlock(&fc->lock); } void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - req->isreply = 1; + __set_bit(FR_ISREPLY, &req->flags); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } __fuse_request_send(fc, req); } EXPORT_SYMBOL_GPL(fuse_request_send); @@ -586,10 +575,20 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -static void fuse_request_send_nowait_locked(struct fuse_conn *fc, - struct fuse_req *req) +/* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) { - BUG_ON(!req->background); + BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } + __set_bit(FR_ISREPLY, &req->flags); fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; @@ -602,54 +601,40 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, flush_bg_queue(fc); } -static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { + BUG_ON(!req->end); spin_lock(&fc->lock); if (fc->connected) { - fuse_request_send_nowait_locked(fc, req); + fuse_request_send_background_locked(fc, req); spin_unlock(&fc->lock); } else { + spin_unlock(&fc->lock); req->out.h.error = -ENOTCONN; - request_end(fc, req); + req->end(fc, req); + fuse_put_request(fc, req); } } - -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) -{ - req->isreply = 1; - fuse_request_send_nowait(fc, req); -} EXPORT_SYMBOL_GPL(fuse_request_send_background); static int fuse_request_send_notify_reply(struct fuse_conn *fc, struct fuse_req *req, u64 unique) { int err = -ENODEV; + struct fuse_iqueue *fiq = &fc->iq; - req->isreply = 0; + __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; - spin_lock(&fc->lock); - if (fc->connected) { - queue_request(fc, req); + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { + queue_request(fiq, req); err = 0; } - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return err; } -/* - * Called under fc->lock - * - * fc->connected must have been checked previously - */ -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req) -{ - req->isreply = 1; - fuse_request_send_nowait_locked(fc, req); -} - void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); @@ -665,7 +650,7 @@ void fuse_force_forget(struct file *file, u64 nodeid) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - req->isreply = 0; + __clear_bit(FR_ISREPLY, &req->flags); __fuse_request_send(fc, req); /* ignore errors */ fuse_put_request(fc, req); @@ -676,38 +661,39 @@ void fuse_force_forget(struct file *file, u64 nodeid) * anything that could cause a page-fault. If the request was already * aborted bail out. */ -static int lock_request(struct fuse_conn *fc, struct fuse_req *req) +static int lock_request(struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fc->lock); - if (req->aborted) + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) err = -ENOENT; else - req->locked = 1; - spin_unlock(&fc->lock); + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); } return err; } /* - * Unlock request. If it was aborted during being locked, the - * requester thread is currently waiting for it to be unlocked, so - * wake it up. + * Unlock request. If it was aborted while locked, caller is responsible + * for unlocking and ending the request. */ -static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) +static int unlock_request(struct fuse_req *req) { + int err = 0; if (req) { - spin_lock(&fc->lock); - req->locked = 0; - if (req->aborted) - wake_up(&req->waitq); - spin_unlock(&fc->lock); + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) + err = -ENOENT; + else + clear_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); } + return err; } struct fuse_copy_state { - struct fuse_conn *fc; int write; struct fuse_req *req; struct iov_iter *iter; @@ -721,13 +707,10 @@ struct fuse_copy_state { unsigned move_pages:1; }; -static void fuse_copy_init(struct fuse_copy_state *cs, - struct fuse_conn *fc, - int write, +static void fuse_copy_init(struct fuse_copy_state *cs, int write, struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); - cs->fc = fc; cs->write = write; cs->iter = iter; } @@ -760,7 +743,10 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct page *page; int err; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); if (cs->pipebufs) { struct pipe_buffer *buf = cs->pipebufs; @@ -809,7 +795,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) iov_iter_advance(cs->iter, err); } - return lock_request(cs->fc, cs->req); + return lock_request(cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -860,7 +846,10 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); err = buf->ops->confirm(cs->pipe, buf); @@ -914,12 +903,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) lru_cache_add_file(newpage); err = 0; - spin_lock(&cs->fc->lock); - if (cs->req->aborted) + spin_lock(&cs->req->waitq.lock); + if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else *pagep = newpage; - spin_unlock(&cs->fc->lock); + spin_unlock(&cs->req->waitq.lock); if (err) { unlock_page(newpage); @@ -939,7 +928,7 @@ out_fallback: cs->pg = buf->page; cs->offset = buf->offset; - err = lock_request(cs->fc, cs->req); + err = lock_request(cs->req); if (err) return err; @@ -950,11 +939,15 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, unsigned offset, unsigned count) { struct pipe_buffer *buf; + int err; if (cs->nr_segs == cs->pipe->buffers) return -EIO; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); buf = cs->pipebufs; @@ -1065,36 +1058,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, return err; } -static int forget_pending(struct fuse_conn *fc) +static int forget_pending(struct fuse_iqueue *fiq) { - return fc->forget_list_head.next != NULL; + return fiq->forget_list_head.next != NULL; } -static int request_pending(struct fuse_conn *fc) +static int request_pending(struct fuse_iqueue *fiq) { - return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || - forget_pending(fc); -} - -/* Wait until a request is available on the pending list */ -static void request_wait(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&fc->waitq, &wait); - while (fc->connected && !request_pending(fc)) { - set_current_state(TASK_INTERRUPTIBLE); - if (signal_pending(current)) - break; - - spin_unlock(&fc->lock); - schedule(); - spin_lock(&fc->lock); - } - set_current_state(TASK_RUNNING); - remove_wait_queue(&fc->waitq, &wait); + return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) || + forget_pending(fiq); } /* @@ -1103,11 +1075,12 @@ __acquires(fc->lock) * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * - * Called with fc->lock held, releases it + * Called with fiq->waitq.lock held, releases it */ -static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, +static int fuse_read_interrupt(struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(fc->lock) +__releases(fiq->waitq.lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1115,7 +1088,7 @@ __releases(fc->lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = fuse_get_unique(fc); + req->intr_unique = fuse_get_unique(fiq); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; @@ -1123,7 +1096,7 @@ __releases(fc->lock) ih.unique = req->intr_unique; arg.unique = req->in.h.unique; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); if (nbytes < reqsize) return -EINVAL; @@ -1135,21 +1108,21 @@ __releases(fc->lock) return err ? err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, +static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, unsigned max, unsigned *countp) { - struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; unsigned count; for (count = 0; *newhead != NULL && count < max; count++) newhead = &(*newhead)->next; - fc->forget_list_head.next = *newhead; + fiq->forget_list_head.next = *newhead; *newhead = NULL; - if (fc->forget_list_head.next == NULL) - fc->forget_list_tail = &fc->forget_list_head; + if (fiq->forget_list_head.next == NULL) + fiq->forget_list_tail = &fiq->forget_list_head; if (countp != NULL) *countp = count; @@ -1157,24 +1130,24 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, return head; } -static int fuse_read_single_forget(struct fuse_conn *fc, +static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique(fc), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; @@ -1190,9 +1163,9 @@ __releases(fc->lock) return ih.len; } -static int fuse_read_batch_forget(struct fuse_conn *fc, +static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { int err; unsigned max_forgets; @@ -1201,18 +1174,18 @@ __releases(fc->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique(fc), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; if (nbytes < ih.len) { - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); - head = dequeue_forget(fc, max_forgets, &count); - spin_unlock(&fc->lock); + head = dequeue_forget(fiq, max_forgets, &count); + spin_unlock(&fiq->waitq.lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); @@ -1239,14 +1212,15 @@ __releases(fc->lock) return ih.len; } -static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { - if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) - return fuse_read_single_forget(fc, cs, nbytes); + if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fiq, cs, nbytes); else - return fuse_read_batch_forget(fc, cs, nbytes); + return fuse_read_batch_forget(fiq, cs, nbytes); } /* @@ -1258,46 +1232,51 @@ __releases(fc->lock) * request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ -static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, +static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_copy_state *cs, size_t nbytes) { - int err; + ssize_t err; + struct fuse_conn *fc = fud->fc; + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_in *in; unsigned reqsize; restart: - spin_lock(&fc->lock); + spin_lock(&fiq->waitq.lock); err = -EAGAIN; - if ((file->f_flags & O_NONBLOCK) && fc->connected && - !request_pending(fc)) + if ((file->f_flags & O_NONBLOCK) && fiq->connected && + !request_pending(fiq)) goto err_unlock; - request_wait(fc); - err = -ENODEV; - if (!fc->connected) + err = wait_event_interruptible_exclusive_locked(fiq->waitq, + !fiq->connected || request_pending(fiq)); + if (err) goto err_unlock; - err = -ERESTARTSYS; - if (!request_pending(fc)) + + err = -ENODEV; + if (!fiq->connected) goto err_unlock; - if (!list_empty(&fc->interrupts)) { - req = list_entry(fc->interrupts.next, struct fuse_req, + if (!list_empty(&fiq->interrupts)) { + req = list_entry(fiq->interrupts.next, struct fuse_req, intr_entry); - return fuse_read_interrupt(fc, cs, nbytes, req); + return fuse_read_interrupt(fiq, cs, nbytes, req); } - if (forget_pending(fc)) { - if (list_empty(&fc->pending) || fc->forget_batch-- > 0) - return fuse_read_forget(fc, cs, nbytes); + if (forget_pending(fiq)) { + if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0) + return fuse_read_forget(fc, fiq, cs, nbytes); - if (fc->forget_batch <= -8) - fc->forget_batch = 16; + if (fiq->forget_batch <= -8) + fiq->forget_batch = 16; } - req = list_entry(fc->pending.next, struct fuse_req, list); - req->state = FUSE_REQ_READING; - list_move(&req->list, &fc->io); + req = list_entry(fiq->pending.next, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + spin_unlock(&fiq->waitq.lock); in = &req->in; reqsize = in->h.len; @@ -1310,37 +1289,48 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, request_end(fc, req); goto restart; } - spin_unlock(&fc->lock); + spin_lock(&fpq->lock); + list_add(&req->list, &fpq->io); + spin_unlock(&fpq->lock); cs->req = req; err = fuse_copy_one(cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(cs); - spin_lock(&fc->lock); - req->locked = 0; - if (req->aborted) { - request_end(fc, req); - return -ENODEV; + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) { + err = -ENODEV; + goto out_end; } if (err) { req->out.h.error = -EIO; - request_end(fc, req); - return err; + goto out_end; } - if (!req->isreply) - request_end(fc, req); - else { - req->state = FUSE_REQ_SENT; - list_move_tail(&req->list, &fc->processing); - if (req->interrupted) - queue_interrupt(fc, req); - spin_unlock(&fc->lock); + if (!test_bit(FR_ISREPLY, &req->flags)) { + err = reqsize; + goto out_end; } + list_move_tail(&req->list, &fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + if (test_bit(FR_INTERRUPTED, &req->flags)) + queue_interrupt(fiq, req); + return reqsize; +out_end: + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + request_end(fc, req); + return err; + err_unlock: - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return err; } @@ -1359,16 +1349,17 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return -EPERM; if (!iter_is_iovec(to)) return -EINVAL; - fuse_copy_init(&cs, fc, 1, to); + fuse_copy_init(&cs, 1, to); - return fuse_dev_do_read(fc, file, &cs, iov_iter_count(to)); + return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, @@ -1380,18 +1371,19 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, int do_wakeup = 0; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(in); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(in); + + if (!fud) return -EPERM; bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, fc, 1, NULL); + fuse_copy_init(&cs, 1, NULL); cs.pipebufs = bufs; cs.pipe = pipe; - ret = fuse_dev_do_read(fc, in, &cs, len); + ret = fuse_dev_do_read(fud, in, &cs, len); if (ret < 0) goto out; @@ -1830,11 +1822,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) +static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { struct fuse_req *req; - list_for_each_entry(req, &fc->processing, list) { + list_for_each_entry(req, &fpq->processing, list) { if (req->in.h.unique == unique || req->intr_unique == unique) return req; } @@ -1871,10 +1863,12 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * it from the list and copy the rest of the buffer to the request. * The request is finished by calling request_end() */ -static ssize_t fuse_dev_do_write(struct fuse_conn *fc, +static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) { int err; + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_out_header oh; @@ -1902,63 +1896,60 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, if (oh.error <= -1000 || oh.error > 0) goto err_finish; - spin_lock(&fc->lock); + spin_lock(&fpq->lock); err = -ENOENT; - if (!fc->connected) - goto err_unlock; + if (!fpq->connected) + goto err_unlock_pq; - req = request_find(fc, oh.unique); + req = request_find(fpq, oh.unique); if (!req) - goto err_unlock; + goto err_unlock_pq; - if (req->aborted) { - spin_unlock(&fc->lock); - fuse_copy_finish(cs); - spin_lock(&fc->lock); - request_end(fc, req); - return -ENOENT; - } /* Is it an interrupt reply? */ if (req->intr_unique == oh.unique) { + spin_unlock(&fpq->lock); + err = -EINVAL; if (nbytes != sizeof(struct fuse_out_header)) - goto err_unlock; + goto err_finish; if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - queue_interrupt(fc, req); + queue_interrupt(&fc->iq, req); - spin_unlock(&fc->lock); fuse_copy_finish(cs); return nbytes; } - req->state = FUSE_REQ_WRITING; - list_move(&req->list, &fc->io); + clear_bit(FR_SENT, &req->flags); + list_move(&req->list, &fpq->io); req->out.h = oh; - req->locked = 1; + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&fpq->lock); cs->req = req; if (!req->out.page_replace) cs->move_pages = 0; - spin_unlock(&fc->lock); err = copy_out_args(cs, &req->out, nbytes); fuse_copy_finish(cs); - spin_lock(&fc->lock); - req->locked = 0; - if (!err) { - if (req->aborted) - err = -ENOENT; - } else if (!req->aborted) + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) + err = -ENOENT; + else if (err) req->out.h.error = -EIO; + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + request_end(fc, req); return err ? err : nbytes; - err_unlock: - spin_unlock(&fc->lock); + err_unlock_pq: + spin_unlock(&fpq->lock); err_finish: fuse_copy_finish(cs); return err; @@ -1967,16 +1958,17 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + + if (!fud) return -EPERM; if (!iter_is_iovec(from)) return -EINVAL; - fuse_copy_init(&cs, fc, 0, from); + fuse_copy_init(&cs, 0, from); - return fuse_dev_do_write(fc, &cs, iov_iter_count(from)); + return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, @@ -1987,12 +1979,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_conn *fc; + struct fuse_dev *fud; size_t rem; ssize_t ret; - fc = fuse_get_conn(out); - if (!fc) + fud = fuse_get_dev(out); + if (!fud) return -EPERM; bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2039,7 +2031,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, fc, 0, NULL); + fuse_copy_init(&cs, 0, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; @@ -2047,7 +2039,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, if (flags & SPLICE_F_MOVE) cs.move_pages = 1; - ret = fuse_dev_do_write(fc, &cs, len); + ret = fuse_dev_do_write(fud, &cs, len); for (idx = 0; idx < nbuf; idx++) { struct pipe_buffer *buf = &bufs[idx]; @@ -2061,18 +2053,21 @@ out: static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { unsigned mask = POLLOUT | POLLWRNORM; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_iqueue *fiq; + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return POLLERR; - poll_wait(file, &fc->waitq, wait); + fiq = &fud->fc->iq; + poll_wait(file, &fiq->waitq, wait); - spin_lock(&fc->lock); - if (!fc->connected) + spin_lock(&fiq->waitq.lock); + if (!fiq->connected) mask = POLLERR; - else if (request_pending(fc)) + else if (request_pending(fiq)) mask |= POLLIN | POLLRDNORM; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return mask; } @@ -2083,67 +2078,18 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) -__releases(fc->lock) -__acquires(fc->lock) { while (!list_empty(head)) { struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; - request_end(fc, req); - spin_lock(&fc->lock); - } -} - -/* - * Abort requests under I/O - * - * The requests are set to aborted and finished, and the request - * waiter is woken up. This will make request_wait_answer() wait - * until the request is unlocked and then return. - * - * If the request is asynchronous, then the end function needs to be - * called after waiting for the request to be unlocked (if it was - * locked). - */ -static void end_io_requests(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - while (!list_empty(&fc->io)) { - struct fuse_req *req = - list_entry(fc->io.next, struct fuse_req, list); - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - - req->aborted = 1; - req->out.h.error = -ECONNABORTED; - req->state = FUSE_REQ_FINISHED; + clear_bit(FR_PENDING, &req->flags); + clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - wake_up(&req->waitq); - if (end) { - req->end = NULL; - __fuse_get_request(req); - spin_unlock(&fc->lock); - wait_event(req->waitq, !req->locked); - end(fc, req); - fuse_put_request(fc, req); - spin_lock(&fc->lock); - } + request_end(fc, req); } } -static void end_queued_requests(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - fc->max_background = UINT_MAX; - flush_bg_queue(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); - while (forget_pending(fc)) - kfree(dequeue_forget(fc, 1, NULL)); -} - static void end_polls(struct fuse_conn *fc) { struct rb_node *p; @@ -2162,67 +2108,164 @@ static void end_polls(struct fuse_conn *fc) /* * Abort all requests. * - * Emergency exit in case of a malicious or accidental deadlock, or - * just a hung filesystem. - * - * The same effect is usually achievable through killing the - * filesystem daemon and all users of the filesystem. The exception - * is the combination of an asynchronous request and the tricky - * deadlock (see Documentation/filesystems/fuse.txt). + * Emergency exit in case of a malicious or accidental deadlock, or just a hung + * filesystem. * - * During the aborting, progression of requests from the pending and - * processing lists onto the io list, and progression of new requests - * onto the pending list is prevented by req->connected being false. + * The same effect is usually achievable through killing the filesystem daemon + * and all users of the filesystem. The exception is the combination of an + * asynchronous request and the tricky deadlock (see + * Documentation/filesystems/fuse.txt). * - * Progression of requests under I/O to the processing list is - * prevented by the req->aborted flag being true for these requests. - * For this reason requests on the io list must be aborted first. + * Aborting requests under I/O goes as follows: 1: Separate out unlocked + * requests, they should be finished off immediately. Locked requests will be + * finished after unlock; see unlock_request(). 2: Finish off the unlocked + * requests. It is possible that some request will finish before we can. This + * is OK, the request will in that case be removed from the list before we touch + * it. */ void fuse_abort_conn(struct fuse_conn *fc) { + struct fuse_iqueue *fiq = &fc->iq; + spin_lock(&fc->lock); if (fc->connected) { + struct fuse_dev *fud; + struct fuse_req *req, *next; + LIST_HEAD(to_end1); + LIST_HEAD(to_end2); + fc->connected = 0; fc->blocked = 0; fuse_set_initialized(fc); - end_io_requests(fc); - end_queued_requests(fc); + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + fpq->connected = 0; + list_for_each_entry_safe(req, next, &fpq->io, list) { + req->out.h.error = -ECONNABORTED; + spin_lock(&req->waitq.lock); + set_bit(FR_ABORTED, &req->flags); + if (!test_bit(FR_LOCKED, &req->flags)) { + set_bit(FR_PRIVATE, &req->flags); + list_move(&req->list, &to_end1); + } + spin_unlock(&req->waitq.lock); + } + list_splice_init(&fpq->processing, &to_end2); + spin_unlock(&fpq->lock); + } + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + + spin_lock(&fiq->waitq.lock); + fiq->connected = 0; + list_splice_init(&fiq->pending, &to_end2); + while (forget_pending(fiq)) + kfree(dequeue_forget(fiq, 1, NULL)); + wake_up_all_locked(&fiq->waitq); + spin_unlock(&fiq->waitq.lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); - wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_unlock(&fc->lock); + + while (!list_empty(&to_end1)) { + req = list_first_entry(&to_end1, struct fuse_req, list); + __fuse_get_request(req); + list_del_init(&req->list); + request_end(fc, req); + } + end_requests(fc, &to_end2); + } else { + spin_unlock(&fc->lock); } - spin_unlock(&fc->lock); } EXPORT_SYMBOL_GPL(fuse_abort_conn); int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc = fuse_get_conn(file); - if (fc) { - spin_lock(&fc->lock); - fc->connected = 0; - fc->blocked = 0; - fuse_set_initialized(fc); - end_queued_requests(fc); - end_polls(fc); - wake_up_all(&fc->blocked_waitq); - spin_unlock(&fc->lock); - fuse_conn_put(fc); - } + struct fuse_dev *fud = fuse_get_dev(file); + + if (fud) { + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; + WARN_ON(!list_empty(&fpq->io)); + end_requests(fc, &fpq->processing); + /* Are we the last open device? */ + if (atomic_dec_and_test(&fc->dev_count)) { + WARN_ON(fc->iq.fasync != NULL); + fuse_abort_conn(fc); + } + fuse_dev_free(fud); + } return 0; } EXPORT_SYMBOL_GPL(fuse_dev_release); static int fuse_dev_fasync(int fd, struct file *file, int on) { - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return -EPERM; /* No locking - fasync_helper does its own locking */ - return fasync_helper(fd, file, on, &fc->fasync); + return fasync_helper(fd, file, on, &fud->fc->iq.fasync); +} + +static int fuse_device_clone(struct fuse_conn *fc, struct file *new) +{ + struct fuse_dev *fud; + + if (new->private_data) + return -EINVAL; + + fud = fuse_dev_alloc(fc); + if (!fud) + return -ENOMEM; + + new->private_data = fud; + atomic_inc(&fc->dev_count); + + return 0; +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int err = -ENOTTY; + + if (cmd == FUSE_DEV_IOC_CLONE) { + int oldfd; + + err = -EFAULT; + if (!get_user(oldfd, (__u32 __user *) arg)) { + struct file *old = fget(oldfd); + + err = -EINVAL; + if (old) { + struct fuse_dev *fud = NULL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (old->f_op == file->f_op && + old->f_cred->user_ns == file->f_cred->user_ns) + fud = fuse_get_dev(old); + + if (fud) { + mutex_lock(&fuse_mutex); + err = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + fput(old); + } + } + } + return err; } const struct file_operations fuse_dev_operations = { @@ -2236,6 +2279,8 @@ const struct file_operations fuse_dev_operations = { .poll = fuse_dev_poll, .release = fuse_dev_release, .fasync = fuse_dev_fasync, + .unlocked_ioctl = fuse_dev_ioctl, + .compat_ioctl = fuse_dev_ioctl, }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8c5e2fa68835..f523f2f04c19 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -96,17 +96,17 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) * Drop the release request when client does not * implement 'open' */ - req->background = 0; + __clear_bit(FR_BACKGROUND, &req->flags); iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { - req->background = 0; + __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(ff->fc, req); iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else { req->end = fuse_release_end; - req->background = 1; + __set_bit(FR_BACKGROUND, &req->flags); fuse_request_send_background(ff->fc, req); } kfree(ff); @@ -299,8 +299,8 @@ void fuse_sync_release(struct fuse_file *ff, int flags) { WARN_ON(atomic_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); - ff->reserved_req->force = 1; - ff->reserved_req->background = 0; + __set_bit(FR_FORCE, &ff->reserved_req->flags); + __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags); fuse_request_send(ff->fc, ff->reserved_req); fuse_put_request(ff->fc, ff->reserved_req); kfree(ff); @@ -426,7 +426,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - req->force = 1; + __set_bit(FR_FORCE, &req->flags); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -1169,7 +1169,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err <= 0) goto out; - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -1611,7 +1611,8 @@ static int fuse_writepage_locked(struct page *page) if (!req) goto err; - req->background = 1; /* writeback always goes to bg_queue */ + /* writeback always goes to bg_queue */ + __set_bit(FR_BACKGROUND, &req->flags); tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; @@ -1742,8 +1743,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, } } - if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || - old_req->state == FUSE_REQ_PENDING)) { + if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) { struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); @@ -1830,7 +1830,7 @@ static int fuse_writepages_fill(struct page *page, req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; req->misc.write.next = NULL; req->in.argpages = 1; - req->background = 1; + __set_bit(FR_BACKGROUND, &req->flags); req->num_pages = 0; req->end = fuse_writepage_end; req->inode = inode; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7354dc142a50..405113101db8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -241,16 +241,6 @@ struct fuse_args { #define FUSE_ARGS(args) struct fuse_args args = {} -/** The request state */ -enum fuse_req_state { - FUSE_REQ_INIT = 0, - FUSE_REQ_PENDING, - FUSE_REQ_READING, - FUSE_REQ_SENT, - FUSE_REQ_WRITING, - FUSE_REQ_FINISHED -}; - /** The request IO state (for asynchronous processing) */ struct fuse_io_priv { int async; @@ -267,7 +257,40 @@ struct fuse_io_priv { }; /** + * Request flags + * + * FR_ISREPLY: set if the request has reply + * FR_FORCE: force sending of the request even if interrupted + * FR_BACKGROUND: request is sent in the background + * FR_WAITING: request is counted as "waiting" + * FR_ABORTED: the request was aborted + * FR_INTERRUPTED: the request has been interrupted + * FR_LOCKED: data is being copied to/from the request + * FR_PENDING: request is not yet in userspace + * FR_SENT: request is in userspace, waiting for an answer + * FR_FINISHED: request is finished + * FR_PRIVATE: request is on private list + */ +enum fuse_req_flag { + FR_ISREPLY, + FR_FORCE, + FR_BACKGROUND, + FR_WAITING, + FR_ABORTED, + FR_INTERRUPTED, + FR_LOCKED, + FR_PENDING, + FR_SENT, + FR_FINISHED, + FR_PRIVATE, +}; + +/** * A request to the client + * + * .waitq.lock protects the following fields: + * - FR_ABORTED + * - FR_LOCKED (may also be modified under fc->lock, tested under both) */ struct fuse_req { /** This can be on either pending processing or io lists in @@ -283,35 +306,8 @@ struct fuse_req { /** Unique ID for the interrupt request */ u64 intr_unique; - /* - * The following bitfields are either set once before the - * request is queued or setting/clearing them is protected by - * fuse_conn->lock - */ - - /** True if the request has reply */ - unsigned isreply:1; - - /** Force sending of the request even if interrupted */ - unsigned force:1; - - /** The request was aborted */ - unsigned aborted:1; - - /** Request is sent in the background */ - unsigned background:1; - - /** The request has been interrupted */ - unsigned interrupted:1; - - /** Data is being copied to/from the request */ - unsigned locked:1; - - /** Request is counted as "waiting" */ - unsigned waiting:1; - - /** State of the request */ - enum fuse_req_state state; + /* Request flags, updated with test/set/clear_bit() */ + unsigned long flags; /** The request input */ struct fuse_in in; @@ -380,6 +376,61 @@ struct fuse_req { struct file *stolen_file; }; +struct fuse_iqueue { + /** Connection established */ + unsigned connected; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The next unique request id */ + u64 reqctr; + + /** The list of pending requests */ + struct list_head pending; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; +}; + +struct fuse_pqueue { + /** Connection established */ + unsigned connected; + + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** The list of requests being processed */ + struct list_head processing; + + /** The list of requests under I/O */ + struct list_head io; +}; + +/** + * Fuse device instance + */ +struct fuse_dev { + /** Fuse connection for this device */ + struct fuse_conn *fc; + + /** Processing queue */ + struct fuse_pqueue pq; + + /** list entry on fc->devices */ + struct list_head entry; +}; + /** * A Fuse connection. * @@ -394,6 +445,9 @@ struct fuse_conn { /** Refcount */ atomic_t count; + /** Number of fuse_dev's */ + atomic_t dev_count; + struct rcu_head rcu; /** The user id for this mount */ @@ -411,17 +465,8 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Readers of the connection are waiting on this */ - wait_queue_head_t waitq; - - /** The list of pending requests */ - struct list_head pending; - - /** The list of requests being processed */ - struct list_head processing; - - /** The list of requests under I/O */ - struct list_head io; + /** Input queue */ + struct fuse_iqueue iq; /** The next unique kernel file handle */ u64 khctr; @@ -444,16 +489,6 @@ struct fuse_conn { /** The list of background requests set aside for later queuing */ struct list_head bg_queue; - /** Pending interrupts */ - struct list_head interrupts; - - /** Queue of pending forgets */ - struct fuse_forget_link forget_list_head; - struct fuse_forget_link *forget_list_tail; - - /** Batching of FORGET requests (positive indicates FORGET batch) */ - int forget_batch; - /** Flag indicating that INIT reply has been received. Allocating * any fuse request will be suspended until the flag is set */ int initialized; @@ -469,9 +504,6 @@ struct fuse_conn { /** waitq for reserved requests */ wait_queue_head_t reserved_req_waitq; - /** The next unique request id */ - u64 reqctr; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -594,9 +626,6 @@ struct fuse_conn { /** number of dentries used in the above array */ int ctl_ndents; - /** O_ASYNC requests */ - struct fasync_struct *fasync; - /** Key for lock owner ID scrambling */ u32 scramble_key[4]; @@ -614,6 +643,9 @@ struct fuse_conn { /** Read/write semaphore to hold when accessing sb. */ struct rw_semaphore killsb; + + /** List of device instances belonging to this connection */ + struct list_head devices; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -826,6 +858,9 @@ void fuse_conn_init(struct fuse_conn *fc); */ void fuse_conn_put(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); +void fuse_dev_free(struct fuse_dev *fud); + /** * Add connection to control filesystem */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 082ac1c97f39..2913db2a5b99 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -362,8 +362,8 @@ static void fuse_send_destroy(struct fuse_conn *fc) if (req && fc->conn_init) { fc->destroy_req = NULL; req->in.h.opcode = FUSE_DESTROY; - req->force = 1; - req->background = 0; + __set_bit(FR_FORCE, &req->flags); + __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(fc, req); fuse_put_request(fc, req); } @@ -567,30 +567,46 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void fuse_iqueue_init(struct fuse_iqueue *fiq) +{ + memset(fiq, 0, sizeof(struct fuse_iqueue)); + init_waitqueue_head(&fiq->waitq); + INIT_LIST_HEAD(&fiq->pending); + INIT_LIST_HEAD(&fiq->interrupts); + fiq->forget_list_tail = &fiq->forget_list_head; + fiq->connected = 1; +} + +static void fuse_pqueue_init(struct fuse_pqueue *fpq) +{ + memset(fpq, 0, sizeof(struct fuse_pqueue)); + spin_lock_init(&fpq->lock); + INIT_LIST_HEAD(&fpq->processing); + INIT_LIST_HEAD(&fpq->io); + fpq->connected = 1; +} + void fuse_conn_init(struct fuse_conn *fc) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); init_rwsem(&fc->killsb); atomic_set(&fc->count, 1); - init_waitqueue_head(&fc->waitq); + atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); init_waitqueue_head(&fc->reserved_req_waitq); - INIT_LIST_HEAD(&fc->pending); - INIT_LIST_HEAD(&fc->processing); - INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->interrupts); + fuse_iqueue_init(&fc->iq); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); - fc->forget_list_tail = &fc->forget_list_head; + INIT_LIST_HEAD(&fc->devices); atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; fc->khctr = 0; fc->polled_files = RB_ROOT; - fc->reqctr = 0; fc->blocked = 0; fc->initialized = 0; + fc->connected = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } @@ -930,6 +946,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) static void fuse_free_conn(struct fuse_conn *fc) { + WARN_ON(!list_empty(&fc->devices)); kfree_rcu(fc, rcu); } @@ -975,8 +992,42 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } +struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); + if (fud) { + fud->fc = fuse_conn_get(fc); + fuse_pqueue_init(&fud->pq); + + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); + } + + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc); + +void fuse_dev_free(struct fuse_dev *fud) +{ + struct fuse_conn *fc = fud->fc; + + if (fc) { + spin_lock(&fc->lock); + list_del(&fud->entry); + spin_unlock(&fc->lock); + + fuse_conn_put(fc); + } + kfree(fud); +} +EXPORT_SYMBOL_GPL(fuse_dev_free); + static int fuse_fill_super(struct super_block *sb, void *data, int silent) { + struct fuse_dev *fud; struct fuse_conn *fc; struct inode *root; struct fuse_mount_data d; @@ -1026,12 +1077,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_fput; fuse_conn_init(fc); + fc->release = fuse_free_conn; + + fud = fuse_dev_alloc(fc); + if (!fud) + goto err_put_conn; fc->dev = sb->s_dev; fc->sb = sb; err = fuse_bdi_init(fc, sb); if (err) - goto err_put_conn; + goto err_dev_free; sb->s_bdi = &fc->bdi; @@ -1040,7 +1096,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= MS_POSIXACL; - fc->release = fuse_free_conn; fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; @@ -1053,14 +1108,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) root = fuse_get_root_inode(sb, d.rootmode); root_dentry = d_make_root(root); if (!root_dentry) - goto err_put_conn; + goto err_dev_free; /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; - init_req->background = 1; + __set_bit(FR_BACKGROUND, &init_req->flags); if (is_bdev) { fc->destroy_req = fuse_request_alloc(0); @@ -1079,8 +1134,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - fc->connected = 1; - file->private_data = fuse_conn_get(fc); + file->private_data = fud; mutex_unlock(&fuse_mutex); /* * atomic_dec_and_test() in fput() provides the necessary @@ -1099,6 +1153,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_request_free(init_req); err_put_root: dput(root_dentry); + err_dev_free: + fuse_dev_free(fud); err_put_conn: fuse_bdi_destroy(fc); fuse_conn_put(fc); @@ -1238,7 +1294,6 @@ static void fuse_fs_cleanup(void) } static struct kobject *fuse_kobj; -static struct kobject *connections_kobj; static int fuse_sysfs_init(void) { @@ -1250,11 +1305,9 @@ static int fuse_sysfs_init(void) goto out_err; } - connections_kobj = kobject_create_and_add("connections", fuse_kobj); - if (!connections_kobj) { - err = -ENOMEM; + err = sysfs_create_mount_point(fuse_kobj, "connections"); + if (err) goto out_fuse_unregister; - } return 0; @@ -1266,7 +1319,7 @@ static int fuse_sysfs_init(void) static void fuse_sysfs_cleanup(void) { - kobject_put(connections_kobj); + sysfs_remove_mount_point(fuse_kobj, "connections"); kobject_put(fuse_kobj); } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 7833394a9a20..d5369a109781 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -202,22 +202,22 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, * */ -static void gfs2_end_log_write(struct bio *bio, int error) +static void gfs2_end_log_write(struct bio *bio) { struct gfs2_sbd *sdp = bio->bi_private; struct bio_vec *bvec; struct page *page; int i; - if (error) { - sdp->sd_log_error = error; - fs_err(sdp, "Error %d writing to log\n", error); + if (bio->bi_error) { + sdp->sd_log_error = bio->bi_error; + fs_err(sdp, "Error %d writing to log\n", bio->bi_error); } bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, error); + gfs2_end_log_write_bh(sdp, bvec, bio->bi_error); else mempool_free(page, gfs2_page_pool); } @@ -261,18 +261,11 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) { struct super_block *sb = sdp->sd_vfs; - unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev); struct bio *bio; BUG_ON(sdp->sd_log_bio); - while (1) { - bio = bio_alloc(GFP_NOIO, nrvecs); - if (likely(bio)) - break; - nrvecs = max(nrvecs/2, 1U); - } - + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; bio->bi_end_io = gfs2_end_log_write; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1e3a93f2f71d..02586e7eb964 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -171,14 +171,14 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } -static void end_bio_io_page(struct bio *bio, int error) +static void end_bio_io_page(struct bio *bio) { struct page *page = bio->bi_private; - if (!error) + if (!bio->bi_error) SetPageUptodate(page); else - pr_warn("error %d reading superblock\n", error); + pr_warn("error %d reading superblock\n", bio->bi_error); unlock_page(page); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2982445947e1..894fb01a91da 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (is_ancestor(root, sdp->sd_master_dir)) seq_puts(s, ",meta"); if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); + seq_show_option(s, "lockproto", args->ar_lockproto); if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); + seq_show_option(s, "locktable", args->ar_locktable); if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); + seq_show_option(s, "hostdata", args->ar_hostdata); if (args->ar_spectator) seq_puts(s, ",spectator"); if (args->ar_localflocks) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index d3fa6bd9503e..221719eac5de 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -398,11 +397,11 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { - //int i; + int i; - //for (i = 0; i < node->tree->pages_per_bnode; i++) - // if (node->page[i]) - // page_cache_release(node->page[i]); + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); kfree(node); } diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 9f4ee7f52026..6fc766df0461 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -166,9 +169,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -366,6 +366,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 95d255219b1e..1f1c7dcbcc2f 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -252,7 +252,7 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb); #define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) #define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) -#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode)) +#define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 55c03b9e9070..4574fdd3d421 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root) struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid), from_kgid_munged(&init_user_ns, sbi->s_gid)); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 759708fd9331..63924662aaf3 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -566,13 +565,11 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { -#if 0 int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) page_cache_release(node->page[i]); -#endif kfree(node); } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index b0441d65fa54..f91a1faf819e 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -263,7 +263,7 @@ struct hfsplus_inode_info { static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) { - return list_entry(inode, struct hfsplus_inode_info, vfs_inode); + return container_of(inode, struct hfsplus_inode_info, vfs_inode); } /* diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index c90b72ee676d..bb806e58c977 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, from_kuid_munged(&init_user_ns, sbi->uid), from_kgid_munged(&init_user_ns, sbi->gid)); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 059597b23f67..2ac99db3750e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) - seq_printf(seq, ",%s", root_path + offset); + seq_show_option(seq, root_path + offset, NULL); if (append) seq_puts(seq, ",append"); diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index f005046e1591..d6a4b55d2ab0 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -484,3 +484,98 @@ struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *a a->btree.first_free = cpu_to_le16(8); return a; } + +static unsigned find_run(__le32 *bmp, unsigned *idx) +{ + unsigned len; + while (tstbits(bmp, *idx, 1)) { + (*idx)++; + if (unlikely(*idx >= 0x4000)) + return 0; + } + len = 1; + while (!tstbits(bmp, *idx + len, 1)) + len++; + return len; +} + +static int do_trim(struct super_block *s, secno start, unsigned len, secno limit_start, secno limit_end, unsigned minlen, unsigned *result) +{ + int err; + secno end; + if (fatal_signal_pending(current)) + return -EINTR; + end = start + len; + if (start < limit_start) + start = limit_start; + if (end > limit_end) + end = limit_end; + if (start >= end) + return 0; + if (end - start < minlen) + return 0; + err = sb_issue_discard(s, start, end - start, GFP_NOFS, 0); + if (err) + return err; + *result += end - start; + return 0; +} + +int hpfs_trim_fs(struct super_block *s, u64 start, u64 end, u64 minlen, unsigned *result) +{ + int err = 0; + struct hpfs_sb_info *sbi = hpfs_sb(s); + unsigned idx, len, start_bmp, end_bmp; + __le32 *bmp; + struct quad_buffer_head qbh; + + *result = 0; + if (!end || end > sbi->sb_fs_size) + end = sbi->sb_fs_size; + if (start >= sbi->sb_fs_size) + return 0; + if (minlen > 0x4000) + return 0; + if (start < sbi->sb_dirband_start + sbi->sb_dirband_size && end > sbi->sb_dirband_start) { + hpfs_lock(s); + if (s->s_flags & MS_RDONLY) { + err = -EROFS; + goto unlock_1; + } + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + err = -EIO; + goto unlock_1; + } + idx = 0; + while ((len = find_run(bmp, &idx)) && !err) { + err = do_trim(s, sbi->sb_dirband_start + idx * 4, len * 4, start, end, minlen, result); + idx += len; + } + hpfs_brelse4(&qbh); +unlock_1: + hpfs_unlock(s); + } + start_bmp = start >> 14; + end_bmp = (end + 0x3fff) >> 14; + while (start_bmp < end_bmp && !err) { + hpfs_lock(s); + if (s->s_flags & MS_RDONLY) { + err = -EROFS; + goto unlock_2; + } + if (!(bmp = hpfs_map_bitmap(s, start_bmp, &qbh, "trim"))) { + err = -EIO; + goto unlock_2; + } + idx = 0; + while ((len = find_run(bmp, &idx)) && !err) { + err = do_trim(s, (start_bmp << 14) + idx, len, start, end, minlen, result); + idx += len; + } + hpfs_brelse4(&qbh); +unlock_2: + hpfs_unlock(s); + start_bmp++; + } + return err; +} diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index 8057fe4e6574..f626114449e4 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -10,6 +10,30 @@ #include <linux/blkdev.h> #include "hpfs_fn.h" +secno hpfs_search_hotfix_map(struct super_block *s, secno sec) +{ + unsigned i; + struct hpfs_sb_info *sbi = hpfs_sb(s); + for (i = 0; unlikely(i < sbi->n_hotfixes); i++) { + if (sbi->hotfix_from[i] == sec) { + return sbi->hotfix_to[i]; + } + } + return sec; +} + +unsigned hpfs_search_hotfix_map_for_range(struct super_block *s, secno sec, unsigned n) +{ + unsigned i; + struct hpfs_sb_info *sbi = hpfs_sb(s); + for (i = 0; unlikely(i < sbi->n_hotfixes); i++) { + if (sbi->hotfix_from[i] >= sec && sbi->hotfix_from[i] < sec + n) { + n = sbi->hotfix_from[i] - sec; + } + } + return n; +} + void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) { struct buffer_head *bh; @@ -18,6 +42,9 @@ void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size)) return; + if (unlikely(hpfs_search_hotfix_map_for_range(s, secno, n) != n)) + return; + bh = sb_find_get_block(s, secno); if (bh) { if (buffer_uptodate(bh)) { @@ -51,7 +78,7 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head cond_resched(); - *bhp = bh = sb_bread(s, secno); + *bhp = bh = sb_bread(s, hpfs_search_hotfix_map(s, secno)); if (bh != NULL) return bh->b_data; else { @@ -71,7 +98,7 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head cond_resched(); - if ((*bhp = bh = sb_getblk(s, secno)) != NULL) { + if ((*bhp = bh = sb_getblk(s, hpfs_search_hotfix_map(s, secno))) != NULL) { if (!buffer_uptodate(bh)) wait_on_buffer(bh); set_buffer_uptodate(bh); return bh->b_data; @@ -99,10 +126,10 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe hpfs_prefetch_sectors(s, secno, 4 + ahead); - if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0; - if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1; - if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2; - if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3; + if (!hpfs_map_sector(s, secno + 0, &qbh->bh[0], 0)) goto bail0; + if (!hpfs_map_sector(s, secno + 1, &qbh->bh[1], 0)) goto bail1; + if (!hpfs_map_sector(s, secno + 2, &qbh->bh[2], 0)) goto bail2; + if (!hpfs_map_sector(s, secno + 3, &qbh->bh[3], 0)) goto bail3; if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) && likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) && diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 2a8e07425de0..dc540bfcee1d 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -327,4 +327,5 @@ const struct file_operations hpfs_dir_ops = .iterate = hpfs_readdir, .release = hpfs_dir_release, .fsync = hpfs_file_fsync, + .unlocked_ioctl = hpfs_ioctl, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 6d8cfe9b52d6..d3bcdd975700 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -83,6 +83,11 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he if (s) { if (bh_result->b_size >> 9 < n_secs) n_secs = bh_result->b_size >> 9; + n_secs = hpfs_search_hotfix_map_for_range(inode->i_sb, s, n_secs); + if (unlikely(!n_secs)) { + s = hpfs_search_hotfix_map(inode->i_sb, s); + n_secs = 1; + } map_bh(bh_result, inode->i_sb, s); bh_result->b_size = n_secs << 9; goto ret_0; @@ -101,7 +106,7 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he inode->i_blocks++; hpfs_i(inode)->mmu_private += 512; set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, s); + map_bh(bh_result, inode->i_sb, hpfs_search_hotfix_map(inode->i_sb, s)); ret_0: r = 0; ret_r: @@ -181,7 +186,7 @@ static int hpfs_write_end(struct file *file, struct address_space *mapping, static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) { - return generic_block_bmap(mapping,block,hpfs_get_block); + return generic_block_bmap(mapping, block, hpfs_get_block); } const struct address_space_operations hpfs_aops = { @@ -203,6 +208,7 @@ const struct file_operations hpfs_file_ops = .release = hpfs_file_release, .fsync = hpfs_file_fsync, .splice_read = generic_file_splice_read, + .unlocked_ioctl = hpfs_ioctl, }; const struct inode_operations hpfs_file_iops = diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b63b75fa00e7..975654a63c13 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/slab.h> +#include <linux/sched.h> +#include <linux/blkdev.h> #include <asm/unaligned.h> #include "hpfs.h" @@ -86,6 +88,10 @@ struct hpfs_sb_info { unsigned sb_max_fwd_alloc; /* max forwad allocation */ int sb_timeshift; struct rcu_head rcu; + + unsigned n_hotfixes; + secno hotfix_from[256]; + secno hotfix_to[256]; }; /* Four 512-byte buffers and the 2k block obtained by concatenating them */ @@ -200,6 +206,7 @@ void hpfs_free_dnode(struct super_block *, secno); struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *); struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **); struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **); +int hpfs_trim_fs(struct super_block *, u64, u64, u64, unsigned *); /* anode.c */ @@ -214,6 +221,8 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno); /* buffer.c */ +secno hpfs_search_hotfix_map(struct super_block *s, secno sec); +unsigned hpfs_search_hotfix_map_for_range(struct super_block *s, secno sec, unsigned n); void hpfs_prefetch_sectors(struct super_block *, unsigned, int); void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); @@ -282,6 +291,7 @@ __le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head void hpfs_prefetch_bitmap(struct super_block *, unsigned); unsigned char *hpfs_load_code_page(struct super_block *, secno); __le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); +void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock); struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *); @@ -304,7 +314,7 @@ extern const struct address_space_operations hpfs_symlink_aops; static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) { - return list_entry(inode, struct hpfs_inode_info, vfs_inode); + return container_of(inode, struct hpfs_inode_info, vfs_inode); } static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) @@ -318,6 +328,7 @@ __printf(2, 3) void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); unsigned hpfs_get_free_dnodes(struct super_block *); +long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg); /* * local time (HPFS) to GMT (Unix) diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 442770edcdc7..a69bbc1e87f8 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -130,6 +130,32 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) return b; } +void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock) +{ + struct quad_buffer_head qbh; + u32 *directory; + u32 n_hotfixes, n_used_hotfixes; + unsigned i; + + n_hotfixes = le32_to_cpu(spareblock->n_spares); + n_used_hotfixes = le32_to_cpu(spareblock->n_spares_used); + + if (n_hotfixes > 256 || n_used_hotfixes > n_hotfixes) { + hpfs_error(s, "invalid number of hotfixes: %u, used: %u", n_hotfixes, n_used_hotfixes); + return; + } + if (!(directory = hpfs_map_4sectors(s, le32_to_cpu(spareblock->hotfix_map), &qbh, 0))) { + hpfs_error(s, "can't load hotfix map"); + return; + } + for (i = 0; i < n_used_hotfixes; i++) { + hpfs_sb(s)->hotfix_from[i] = le32_to_cpu(directory[i]); + hpfs_sb(s)->hotfix_to[i] = le32_to_cpu(directory[n_hotfixes + i]); + } + hpfs_sb(s)->n_hotfixes = n_used_hotfixes; + hpfs_brelse4(&qbh); +} + /* * Load fnode to memory */ diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index a0872f239f04..9e92c9c2d319 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -8,6 +8,17 @@ #include <linux/sched.h> #include "hpfs_fn.h" +static void hpfs_update_directory_times(struct inode *dir) +{ + time_t t = get_seconds(); + if (t == dir->i_mtime.tv_sec && + t == dir->i_ctime.tv_sec) + return; + dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t; + dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0; + hpfs_write_inode_nolock(dir); +} + static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; @@ -99,6 +110,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) result->i_mode = mode | S_IFDIR; hpfs_write_inode_nolock(result); } + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -187,6 +199,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b result->i_mode = mode | S_IFREG; hpfs_write_inode_nolock(result); } + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -262,6 +275,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de insert_inode_hash(result); hpfs_write_inode_nolock(result); + hpfs_update_directory_times(dir); d_instantiate(dentry, result); brelse(bh); hpfs_unlock(dir->i_sb); @@ -340,6 +354,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy insert_inode_hash(result); hpfs_write_inode_nolock(result); + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -423,6 +438,8 @@ again: out1: hpfs_brelse4(&qbh); out: + if (!err) + hpfs_update_directory_times(dir); hpfs_unlock(dir->i_sb); return err; } @@ -477,6 +494,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) out1: hpfs_brelse4(&qbh); out: + if (!err) + hpfs_update_directory_times(dir); hpfs_unlock(dir->i_sb); return err; } @@ -595,7 +614,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end1; } - end: +end: hpfs_i(i)->i_parent_dir = new_dir->i_ino; if (S_ISDIR(i->i_mode)) { inc_nlink(new_dir); @@ -610,6 +629,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(bh); } end1: + if (!err) { + hpfs_update_directory_times(old_dir); + hpfs_update_directory_times(new_dir); + } hpfs_unlock(i->i_sb); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 7cd00d3a7c9b..a561591896bd 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -52,17 +52,20 @@ static void unmark_dirty(struct super_block *s) } /* Filesystem error... */ -static char err_buf[1024]; - void hpfs_error(struct super_block *s, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("filesystem error: %pV", &vaf); + va_end(args); - pr_err("filesystem error: %s", err_buf); if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { pr_cont("; crashing the system because you wanted it\n"); @@ -196,12 +199,39 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } + +long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + switch (cmd) { + case FITRIM: { + struct fstrim_range range; + secno n_trimmed; + int r; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) + return -EFAULT; + r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed); + if (r) + return r; + range.len = (u64)n_trimmed << 9; + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) + return -EFAULT; + return 0; + } + default: { + return -ENOIOCTLCMD; + } + } +} + + static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; - ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; ei->vfs_inode.i_version = 1; @@ -424,11 +454,14 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) int o; struct hpfs_sb_info *sbi = hpfs_sb(s); char *new_opts = kstrdup(data, GFP_KERNEL); - + + if (!new_opts) + return -ENOMEM; + sync_filesystem(s); *flags |= MS_NOATIME; - + hpfs_lock(s); uid = sbi->sb_uid; gid = sbi->sb_gid; umask = 0777 & ~sbi->sb_mode; @@ -595,6 +628,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail4; } + if (spareblock->n_spares_used) + hpfs_load_hotfix_map(s, spareblock); + /* Load bitmap directory */ if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) goto bail4; @@ -614,18 +650,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) mark_buffer_dirty(bh2); } - if (spareblock->hotfixes_used || spareblock->n_spares_used) { - if (errs >= 2) { - pr_err("Hotfixes not supported here, try chkdsk\n"); - mark_dirty(s, 0); - goto bail4; - } - hpfs_error(s, "hotfixes not supported here, try chkdsk"); - if (errs == 0) - pr_err("Proceeding, but your filesystem will be probably corrupted by this driver...\n"); - else - pr_err("This driver may read bad files or crash when operating on disk with hotfixes.\n"); - } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { if (errs >= 2) { pr_err("Spare dnodes used, try chkdsk\n"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0cf74df68617..316adb968b65 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -12,6 +12,7 @@ #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ +#include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/file.h> @@ -84,6 +85,29 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; +#ifdef CONFIG_NUMA +static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, + index); +} + +static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) +{ + mpol_cond_put(vma->vm_policy); +} +#else +static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ +} + +static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) +{ +} +#endif + static void huge_pagevec_release(struct pagevec *pvec) { int i; @@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void truncate_huge_page(struct page *page) +static void remove_huge_page(struct page *page) { ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } -static void truncate_hugepages(struct inode *inode, loff_t lstart) + +/* + * remove_inode_hugepages handles two distinct cases: truncation and hole + * punch. There are subtle differences in operation for each case. + + * truncation is indicated by end of range being LLONG_MAX + * In this case, we first scan the range and release found pages. + * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv + * maps and global counts. + * hole punch is indicated if end is not LLONG_MAX + * In the hole punch case we scan the range and release found pages. + * Only when releasing a page is the associated region/reserv map + * deleted. The region/reserv map for ranges without associated + * pages are not modified. + * Note: If the passed end of range value is beyond the end of file, but + * not LLONG_MAX this routine still performs a hole punch operation. + */ +static void remove_inode_hugepages(struct inode *inode, loff_t lstart, + loff_t lend) { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); + const pgoff_t end = lend >> huge_page_shift(h); + struct vm_area_struct pseudo_vma; struct pagevec pvec; pgoff_t next; int i, freed = 0; + long lookup_nr = PAGEVEC_SIZE; + bool truncate_op = (lend == LLONG_MAX); + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec, 0); next = start; - while (1) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (next < end) { + /* + * Make sure to never grab more pages that we + * might possibly need. + */ + if (end - next < lookup_nr) + lookup_nr = end - next; + + /* + * This pagevec_lookup() may return pages past 'end', + * so we must check for page->index > end. + */ + if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) { if (next == start) break; next = start; @@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + u32 hash; + + hash = hugetlb_fault_mutex_hash(h, current->mm, + &pseudo_vma, + mapping, next, 0); + mutex_lock(&hugetlb_fault_mutex_table[hash]); lock_page(page); + if (page->index >= end) { + unlock_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + next = end; /* we are done */ + break; + } + + /* + * If page is mapped, it was faulted in after being + * unmapped. Do nothing in this race case. In the + * normal case page is not mapped. + */ + if (!page_mapped(page)) { + bool rsv_on_error = !PagePrivate(page); + /* + * We must free the huge page and remove + * from page cache (remove_huge_page) BEFORE + * removing the region/reserve map + * (hugetlb_unreserve_pages). In rare out + * of memory conditions, removal of the + * region/reserve map could fail. Before + * free'ing the page, note PagePrivate which + * is used in case of error. + */ + remove_huge_page(page); + freed++; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages( + inode, next, + next + 1, 1))) + hugetlb_fix_reserve_counts( + inode, rsv_on_error); + } + } + if (page->index > next) next = page->index; + ++next; - truncate_huge_page(page); unlock_page(page); - freed++; + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); } - BUG_ON(!lstart && mapping->nrpages); - hugetlb_unreserve_pages(inode, start, freed); + + if (truncate_op) + (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; - truncate_hugepages(inode, 0); + remove_inode_hugepages(inode, 0, LLONG_MAX); resv_map = (struct resv_map *)inode->i_mapping->private_data; /* root inode doesn't have the resv_map, so we should check it */ if (resv_map) @@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) { struct vm_area_struct *vma; - vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { + /* + * end == 0 indicates that the entire range after + * start should be unmapped. + */ + vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { unsigned long v_offset; /* @@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ - if (vma->vm_pgoff < pgoff) - v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + if (vma->vm_pgoff < start) + v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; else v_offset = 0; - unmap_hugepage_range(vma, vma->vm_start + v_offset, - vma->vm_end, NULL); + if (end) { + end = ((end - start) << PAGE_SHIFT) + + vma->vm_start + v_offset; + if (end > vma->vm_end) + end = vma->vm_end; + } else + end = vma->vm_end; + + unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL); } } @@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) - hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); + hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); - truncate_hugepages(inode, offset); + remove_inode_hugepages(inode, offset, LLONG_MAX); return 0; } +static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct hstate *h = hstate_inode(inode); + loff_t hpage_size = huge_page_size(h); + loff_t hole_start, hole_end; + + /* + * For hole punch round up the beginning offset of the hole and + * round down the end. + */ + hole_start = round_up(offset, hpage_size); + hole_end = round_down(offset + len, hpage_size); + + if (hole_end > hole_start) { + struct address_space *mapping = inode->i_mapping; + + mutex_lock(&inode->i_mutex); + i_mmap_lock_write(mapping); + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + hugetlb_vmdelete_list(&mapping->i_mmap, + hole_start >> PAGE_SHIFT, + hole_end >> PAGE_SHIFT); + i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, hole_start, hole_end); + mutex_unlock(&inode->i_mutex); + } + + return 0; +} + +static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); + struct vm_area_struct pseudo_vma; + struct mm_struct *mm = current->mm; + loff_t hpage_size = huge_page_size(h); + unsigned long hpage_shift = huge_page_shift(h); + pgoff_t start, index, end; + int error; + u32 hash; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return hugetlbfs_punch_hole(inode, offset, len); + + /* + * Default preallocate case. + * For this range, start is rounded down and end is rounded up + * as well as being converted to page offsets. + */ + start = offset >> hpage_shift; + end = (offset + len + hpage_size - 1) >> hpage_shift; + + mutex_lock(&inode->i_mutex); + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + /* + * Initialize a pseudo vma as this is required by the huge page + * allocation routines. If NUMA is configured, use page index + * as input to create an allocation policy. + */ + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + pseudo_vma.vm_file = file; + + for (index = start; index < end; index++) { + /* + * This is supposed to be the vaddr where the page is being + * faulted in, but we have no vaddr here. + */ + struct page *page; + unsigned long addr; + int avoid_reserve = 0; + + cond_resched(); + + /* + * fallocate(2) manpage permits EINTR; we may have been + * interrupted because we are using up too much memory. + */ + if (signal_pending(current)) { + error = -EINTR; + break; + } + + /* Set numa allocation policy based on index */ + hugetlb_set_vma_policy(&pseudo_vma, inode, index); + + /* addr is the offset within the file (zero based) */ + addr = index * hpage_size; + + /* mutex taken here, fault path and hole punch */ + hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, + index, addr); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* See if already present in mapping to avoid alloc/free */ + page = find_get_page(mapping, index); + if (page) { + put_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + hugetlb_drop_vma_policy(&pseudo_vma); + continue; + } + + /* Allocate page and add to page cache */ + page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); + hugetlb_drop_vma_policy(&pseudo_vma); + if (IS_ERR(page)) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + error = PTR_ERR(page); + goto out; + } + clear_huge_page(page, addr, pages_per_huge_page(h)); + __SetPageUptodate(page); + error = huge_add_to_page_cache(page, mapping, index); + if (unlikely(error)) { + put_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out; + } + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + /* + * page_put due to reference from alloc_huge_page() + * unlock_page because locked by add_to_page_cache() + */ + put_page(page); + unlock_page(page); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); + inode->i_ctime = CURRENT_TIME; + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); +out: + mutex_unlock(&inode->i_mutex); + return error; +} + static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = { .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, - .llseek = default_llseek, + .llseek = default_llseek, + .fallocate = hugetlbfs_fallocate, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { @@ -1010,6 +1276,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; + if (creat_flags == HUGETLB_SHMFS_INODE) + inode->i_flags |= S_PRIVATE; file = ERR_PTR(-ENOMEM); if (hugetlb_reserve_pages(inode, 0, diff --git a/fs/inode.c b/fs/inode.c index 069721f0cc0e..78a17b8859e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -28,16 +28,16 @@ * inode->i_state, inode->i_hash, __iget() * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru - * inode_sb_list_lock protects: - * sb->s_inodes, inode->i_sb_list + * inode->i_sb->s_inode_list_lock protects: + * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * * Lock ordering: * - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * @@ -45,7 +45,7 @@ * inode->i_lock * * inode_hash_lock - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock @@ -57,8 +57,6 @@ static unsigned int i_hash_shift __read_mostly; static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); - /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. @@ -359,7 +357,7 @@ void inode_init_once(struct inode *inode) memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); - INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); @@ -426,18 +424,18 @@ static void inode_lru_list_del(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } } @@ -527,8 +525,8 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - if (!list_empty(&inode->i_wb_list)) - inode_wb_list_del(inode); + if (!list_empty(&inode->i_io_list)) + inode_io_list_del(inode); inode_sb_list_del(inode); @@ -577,6 +575,7 @@ static void dispose_list(struct list_head *head) list_del_init(&inode->i_lru); evict(inode); + cond_resched(); } } @@ -594,7 +593,8 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); +again: + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -609,8 +609,20 @@ void evict_inodes(struct super_block *sb) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); + + /* + * We can have a ton of inodes to evict at unmount time given + * enough memory, check to see if we need to go to sleep for a + * bit so we don't livelock. + */ + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } @@ -631,7 +643,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { @@ -654,7 +666,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); @@ -841,7 +853,11 @@ unsigned int get_next_ino(void) } #endif - *p = ++res; + res++; + /* get_next_ino should not provide a 0 inode number */ + if (unlikely(!res)) + res++; + *p = res; put_cpu_var(last_ino); return res; } @@ -886,7 +902,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&inode_sb_list_lock); + spin_lock_prefetch(&sb->s_inode_list_lock); inode = new_inode_pseudo(sb); if (inode) @@ -1674,7 +1690,31 @@ int should_remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(should_remove_suid); -static int __remove_suid(struct dentry *dentry, int kill) +/* + * Return mask of changes for notify_change() that need to be done as a + * response to write or truncate. Return 0 if nothing has to be changed. + * Negative value on error (change should be denied). + */ +int dentry_needs_remove_privs(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int mask = 0; + int ret; + + if (IS_NOSEC(inode)) + return 0; + + mask = should_remove_suid(dentry); + ret = security_inode_need_killpriv(dentry); + if (ret < 0) + return ret; + if (ret) + mask |= ATTR_KILL_PRIV; + return mask; +} +EXPORT_SYMBOL(dentry_needs_remove_privs); + +static int __remove_privs(struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1686,33 +1726,32 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs, NULL); } -int file_remove_suid(struct file *file) +/* + * Remove special file priviledges (suid, capabilities) when file is written + * to or truncated. + */ +int file_remove_privs(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); - int killsuid; - int killpriv; + int kill; int error = 0; /* Fast path for nothing security related */ if (IS_NOSEC(inode)) return 0; - killsuid = should_remove_suid(dentry); - killpriv = security_inode_need_killpriv(dentry); - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); - if (!error && (inode->i_sb->s_flags & MS_NOSEC)) - inode->i_flags |= S_NOSEC; + kill = file_needs_remove_privs(file); + if (kill < 0) + return kill; + if (kill) + error = __remove_privs(dentry, kill); + if (!error) + inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL(file_remove_suid); +EXPORT_SYMBOL(file_remove_privs); /** * file_update_time - update mtime and ctime time @@ -1967,9 +2006,8 @@ EXPORT_SYMBOL(inode_dio_wait); * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one - * code path which doesn't today --- for example, - * __generic_file_aio_write() calls file_remove_suid() without holding - * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * code path which doesn't today so we use cmpxchg() out of an abundance + * of caution. * * In the long run, i_mutex is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure diff --git a/fs/internal.h b/fs/internal.h index 01dce1d1476b..71859c4d0b41 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -107,18 +107,18 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); +extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c */ -extern spinlock_t inode_sb_list_lock; extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); /* * fs-writeback.c */ -extern void inode_wb_list_del(struct inode *inode); +extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig deleted file mode 100644 index 4e28beeed157..000000000000 --- a/fs/jbd/Kconfig +++ /dev/null @@ -1,30 +0,0 @@ -config JBD - tristate - help - This is a generic journalling layer for block devices. It is - currently used by the ext3 file system, but it could also be - used to add journal support to other file systems or block - devices such as RAID or LVM. - - If you are using the ext3 file system, you need to say Y here. - If you are not using ext3 then you will probably want to say N. - - To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 into the kernel, you - cannot compile this code as a module. - -config JBD_DEBUG - bool "JBD (ext3) debugging support" - depends on JBD && DEBUG_FS - help - If you are using the ext3 journaled file system (or potentially any - other file system/device using JBD), this option allows you to - enable debugging output while the system is running, in order to - help track down any problems you are having. By default the - debugging output will be turned off. - - If you select Y here, then you will be able to turn on debugging - with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a - number between 1 and 5, the higher the number, the more debugging - output is generated. To turn debugging off again, do - "echo 0 > /sys/kernel/debug/jbd/jbd-debug". diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile deleted file mode 100644 index 54aca4868a36..000000000000 --- a/fs/jbd/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for the linux journaling routines. -# - -obj-$(CONFIG_JBD) += jbd.o - -jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c deleted file mode 100644 index 08c03044abdd..000000000000 --- a/fs/jbd/checkpoint.c +++ /dev/null @@ -1,782 +0,0 @@ -/* - * linux/fs/jbd/checkpoint.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 - * - * Copyright 1999 Red Hat Software --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Checkpoint routines for the generic filesystem journaling code. - * Part of the ext2fs journaling system. - * - * Checkpointing is the process of ensuring that a section of the log is - * committed fully to disk, so that that portion of the log can be - * reused. - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <trace/events/jbd.h> - -/* - * Unlink a buffer from a transaction checkpoint list. - * - * Called with j_list_lock held. - */ -static inline void __buffer_unlink_first(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - jh->b_cpnext->b_cpprev = jh->b_cpprev; - jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) { - transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; - } -} - -/* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. - */ -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; -} - -/* - * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * - * Requires j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it - */ -static int __try_to_free_cp_buf(struct journal_head *jh) -{ - int ret = 0; - struct buffer_head *bh = jh2bh(jh); - - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && - !buffer_dirty(bh) && !buffer_write_io_error(bh)) { - /* - * Get our reference so that bh cannot be freed before - * we unlock it - */ - get_bh(bh); - JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __journal_remove_checkpoint(jh) + 1; - jbd_unlock_bh_state(bh); - BUFFER_TRACE(bh, "release"); - __brelse(bh); - } else { - jbd_unlock_bh_state(bh); - } - return ret; -} - -/* - * __log_wait_for_space: wait until there is space in the journal. - * - * Called under j-state_lock *only*. It will be unlocked if we have to wait - * for a checkpoint to free up some space in the log. - */ -void __log_wait_for_space(journal_t *journal) -{ - int nblocks, space_left; - assert_spin_locked(&journal->j_state_lock); - - nblocks = jbd_space_needed(journal); - while (__log_space_left(journal) < nblocks) { - if (journal->j_flags & JFS_ABORT) - return; - spin_unlock(&journal->j_state_lock); - mutex_lock(&journal->j_checkpoint_mutex); - - /* - * Test again, another process may have checkpointed while we - * were waiting for the checkpoint lock. If there are no - * transactions ready to be checkpointed, try to recover - * journal space by calling cleanup_journal_tail(), and if - * that doesn't work, by waiting for the currently committing - * transaction to complete. If there is absolutely no way - * to make progress, this is either a BUG or corrupted - * filesystem, so abort the journal and leave a stack - * trace for forensic evidence. - */ - spin_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - nblocks = jbd_space_needed(journal); - space_left = __log_space_left(journal); - if (space_left < nblocks) { - int chkpt = journal->j_checkpoint_transactions != NULL; - tid_t tid = 0; - - if (journal->j_committing_transaction) - tid = journal->j_committing_transaction->t_tid; - spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); - if (chkpt) { - log_do_checkpoint(journal); - } else if (cleanup_journal_tail(journal) == 0) { - /* We were able to recover space; yay! */ - ; - } else if (tid) { - log_wait_commit(journal, tid); - } else { - printk(KERN_ERR "%s: needed %d blocks and " - "only had %d space available\n", - __func__, nblocks, space_left); - printk(KERN_ERR "%s: no way to get more " - "journal space\n", __func__); - WARN_ON(1); - journal_abort(journal, 0); - } - spin_lock(&journal->j_state_lock); - } else { - spin_unlock(&journal->j_list_lock); - } - mutex_unlock(&journal->j_checkpoint_mutex); - } -} - -/* - * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. - * The caller must restart a list walk. Wait for someone else to run - * jbd_unlock_bh_state(). - */ -static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) - __releases(journal->j_list_lock) -{ - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_lock_bh_state(bh); - jbd_unlock_bh_state(bh); - put_bh(bh); -} - -/* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. - * - * Return 0 on success, and return <0 if some buffers have failed - * to be written out. - * - * Called with j_list_lock held. - */ -static int __wait_cp_io(journal_t *journal, transaction_t *transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - tid_t this_tid; - int released = 0; - int ret = 0; - - this_tid = transaction->t_tid; -restart: - /* Did somebody clean up the transaction in the meanwhile? */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return ret; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; - bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; - } - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - - /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list - */ - released = __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - __brelse(bh); - } - - return ret; -} - -#define NR_BATCH 64 - -static void -__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) -{ - int i; - struct blk_plug plug; - - blk_start_plug(&plug); - for (i = 0; i < *batch_count; i++) - write_dirty_buffer(bhs[i], WRITE_SYNC); - blk_finish_plug(&plug); - - for (i = 0; i < *batch_count; i++) { - struct buffer_head *bh = bhs[i]; - clear_buffer_jwrite(bh); - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - *batch_count = 0; -} - -/* - * Try to flush one buffer from the checkpoint list to disk. - * - * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. Return <0 if the buffer has failed to - * be written out. - * - * Called with j_list_lock held and drops it if 1 is returned - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it - */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) -{ - struct buffer_head *bh = jh2bh(jh); - int ret = 0; - - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - ret = 1; - } else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - ret = 1; - } else if (!buffer_dirty(bh)) { - ret = 1; - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - get_bh(bh); - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __brelse(bh); - } else { - /* - * Important: we are about to write the buffer, and - * possibly block, while still holding the journal lock. - * We cannot afford to let the transaction logic start - * messing around with this buffer before we write it to - * disk, as that would break recoverability. - */ - BUFFER_TRACE(bh, "queue"); - get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); - set_buffer_jwrite(bh); - bhs[*batch_count] = bh; - __buffer_relink_io(jh); - jbd_unlock_bh_state(bh); - (*batch_count)++; - if (*batch_count == NR_BATCH) { - spin_unlock(&journal->j_list_lock); - __flush_batch(journal, bhs, batch_count); - ret = 1; - } - } - return ret; -} - -/* - * Perform an actual checkpoint. We take the first transaction on the - * list of transactions to be checkpointed and send all its buffers - * to disk. We submit larger chunks of data at once. - * - * The journal should be locked before calling this function. - * Called with j_checkpoint_mutex held. - */ -int log_do_checkpoint(journal_t *journal) -{ - transaction_t *transaction; - tid_t this_tid; - int result; - - jbd_debug(1, "Start checkpoint\n"); - - /* - * First thing: if there are any transactions in the log which - * don't need checkpointing, just eliminate them from the - * journal straight away. - */ - result = cleanup_journal_tail(journal); - trace_jbd_checkpoint(journal, result); - jbd_debug(1, "cleanup_journal_tail returned %d\n", result); - if (result <= 0) - return result; - - /* - * OK, we need to start writing disk blocks. Take one transaction - * and write it. - */ - result = 0; - spin_lock(&journal->j_list_lock); - if (!journal->j_checkpoint_transactions) - goto out; - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; -restart: - /* - * If someone cleaned up this transaction while we slept, we're - * done (maybe it's a new transaction, but it fell at the same - * address). - */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; - struct journal_head *jh; - int retry = 0, err; - - while (!retry && transaction->t_checkpoint_list) { - struct buffer_head *bh; - - jh = transaction->t_checkpoint_list; - bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - retry = 1; - break; - } - retry = __process_buffer(journal, jh, bhs,&batch_count); - if (retry < 0 && !result) - result = retry; - if (!retry && (need_resched() || - spin_needbreak(&journal->j_list_lock))) { - spin_unlock(&journal->j_list_lock); - retry = 1; - break; - } - } - - if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } - __flush_batch(journal, bhs, &batch_count); - } - - if (retry) { - spin_lock(&journal->j_list_lock); - goto restart; - } - /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one - */ - err = __wait_cp_io(journal, transaction); - if (!result) - result = err; - } -out: - spin_unlock(&journal->j_list_lock); - if (result < 0) - journal_abort(journal, result); - else - result = cleanup_journal_tail(journal); - - return (result < 0) ? result : 0; -} - -/* - * Check the list of checkpoint transactions for the journal to see if - * we have already got rid of any since the last update of the log tail - * in the journal superblock. If so, we can instantly roll the - * superblock forward to remove those transactions from the log. - * - * Return <0 on error, 0 on success, 1 if there was nothing to clean up. - * - * This is the only part of the journaling code which really needs to be - * aware of transaction aborts. Checkpointing involves writing to the - * main filesystem area rather than to the journal, so it can proceed - * even in abort state, but we must not update the super block if - * checkpointing may have failed. Otherwise, we would lose some metadata - * buffers which should be written-back to the filesystem. - */ - -int cleanup_journal_tail(journal_t *journal) -{ - transaction_t * transaction; - tid_t first_tid; - unsigned int blocknr, freed; - - if (is_journal_aborted(journal)) - return 1; - - /* - * OK, work out the oldest transaction remaining in the log, and - * the log block it starts at. - * - * If the log is now empty, we need to work out which is the - * next transaction ID we will write, and where it will - * start. - */ - spin_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - transaction = journal->j_checkpoint_transactions; - if (transaction) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_committing_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = transaction->t_log_start; - } else if ((transaction = journal->j_running_transaction) != NULL) { - first_tid = transaction->t_tid; - blocknr = journal->j_head; - } else { - first_tid = journal->j_transaction_sequence; - blocknr = journal->j_head; - } - spin_unlock(&journal->j_list_lock); - J_ASSERT(blocknr != 0); - - /* If the oldest pinned transaction is at the tail of the log - already then there's not much we can do right now. */ - if (journal->j_tail_sequence == first_tid) { - spin_unlock(&journal->j_state_lock); - return 1; - } - spin_unlock(&journal->j_state_lock); - - /* - * We need to make sure that any blocks that were recently written out - * --- perhaps by log_do_checkpoint() --- are flushed out before we - * drop the transactions from the journal. Similarly we need to be sure - * superblock makes it to disk before next transaction starts reusing - * freed space (otherwise we could replay some blocks of the new - * transaction thinking they belong to the old one). So we use - * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially - * with an appropriately sized journal, but we need this to guarantee - * correctness. Fortunately cleanup_journal_tail() doesn't get called - * all that often. - */ - journal_update_sb_log_tail(journal, first_tid, blocknr, - WRITE_FLUSH_FUA); - - spin_lock(&journal->j_state_lock); - /* OK, update the superblock to recover the freed space. - * Physical blocks come first: have we wrapped beyond the end of - * the log? */ - freed = blocknr - journal->j_tail; - if (blocknr < journal->j_tail) - freed = freed + journal->j_last - journal->j_first; - - trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); - jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %u), " - "freeing %u\n", - journal->j_tail_sequence, first_tid, blocknr, freed); - - journal->j_free += freed; - journal->j_tail_sequence = first_tid; - journal->j_tail = blocknr; - spin_unlock(&journal->j_state_lock); - return 0; -} - - -/* Checkpoint list management */ - -/* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and release - * them. - * - * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) - */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - int ret, freed = 0; - - *released = 0; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } - } - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - return freed; - } while (jh != last_jh); - - return freed; -} - -/* - * journal_clean_checkpoint_list - * - * Find all the written-back checkpoint buffers in the journal and release them. - * - * Called with the journal locked. - * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) - */ - -int __journal_clean_checkpoint_list(journal_t *journal) -{ - transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0; - int released; - - transaction = journal->j_checkpoint_transactions; - if (!transaction) - goto out; - - last_transaction = transaction->t_cpprev; - next_transaction = transaction; - do { - transaction = next_transaction; - next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); - /* - * This function only frees up some memory if possible so we - * dont have an obligation to finish processing. Bail out if - * preemption requested: - */ - if (need_resched()) - goto out; - if (released) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - if (need_resched()) - goto out; - } while (transaction != last_transaction); -out: - return ret; -} - -/* - * journal_remove_checkpoint: called after a buffer has been committed - * to disk (either by being write-back flushed to disk, or being - * committed to the log). - * - * We cannot safely clean a transaction out of the log until all of the - * buffer updates committed in that transaction have safely been stored - * elsewhere on disk. To achieve this, all of the buffers in a - * transaction need to be maintained on the transaction's checkpoint - * lists until they have been rewritten, at which point this function is - * called to remove the buffer from the existing transaction's - * checkpoint lists. - * - * The function returns 1 if it frees the transaction, 0 otherwise. - * The function can free jh and bh. - * - * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) - */ - -int __journal_remove_checkpoint(struct journal_head *jh) -{ - transaction_t *transaction; - journal_t *journal; - int ret = 0; - - JBUFFER_TRACE(jh, "entry"); - - if ((transaction = jh->b_cp_transaction) == NULL) { - JBUFFER_TRACE(jh, "not on transaction"); - goto out; - } - journal = transaction->t_journal; - - JBUFFER_TRACE(jh, "removing from transaction"); - __buffer_unlink(jh); - jh->b_cp_transaction = NULL; - journal_put_journal_head(jh); - - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) - goto out; - - /* - * There is one special case to worry about: if we have just pulled the - * buffer off a running or committing transaction's checkpoing list, - * then even if the checkpoint list is empty, the transaction obviously - * cannot be dropped! - * - * The locking here around t_state is a bit sleazy. - * See the comment at the end of journal_commit_transaction(). - */ - if (transaction->t_state != T_FINISHED) - goto out; - - /* OK, that was the last buffer for the transaction: we can now - safely remove this transaction from the log */ - - __journal_drop_transaction(journal, transaction); - - /* Just in case anybody was waiting for more transactions to be - checkpointed... */ - wake_up(&journal->j_wait_logspace); - ret = 1; -out: - return ret; -} - -/* - * journal_insert_checkpoint: put a committed buffer onto a checkpoint - * list so that we know when it is safe to clean the transaction out of - * the log. - * - * Called with the journal locked. - * Called with j_list_lock held. - */ -void __journal_insert_checkpoint(struct journal_head *jh, - transaction_t *transaction) -{ - JBUFFER_TRACE(jh, "entry"); - J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); - J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); - - /* Get reference for checkpointing transaction */ - journal_grab_journal_head(jh2bh(jh)); - jh->b_cp_transaction = transaction; - - if (!transaction->t_checkpoint_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_list; - jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_list = jh; -} - -/* - * We've finished with this transaction structure: adios... - * - * The transaction must have no links except for the checkpoint by this - * point. - * - * Called with the journal locked. - * Called with j_list_lock held. - */ - -void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) -{ - assert_spin_locked(&journal->j_list_lock); - if (transaction->t_cpnext) { - transaction->t_cpnext->t_cpprev = transaction->t_cpprev; - transaction->t_cpprev->t_cpnext = transaction->t_cpnext; - if (journal->j_checkpoint_transactions == transaction) - journal->j_checkpoint_transactions = - transaction->t_cpnext; - if (journal->j_checkpoint_transactions == transaction) - journal->j_checkpoint_transactions = NULL; - } - - J_ASSERT(transaction->t_state == T_FINISHED); - J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); - J_ASSERT(transaction->t_forget == NULL); - J_ASSERT(transaction->t_iobuf_list == NULL); - J_ASSERT(transaction->t_shadow_list == NULL); - J_ASSERT(transaction->t_log_list == NULL); - J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); - J_ASSERT(journal->j_committing_transaction != transaction); - J_ASSERT(journal->j_running_transaction != transaction); - - trace_jbd_drop_transaction(journal, transaction); - jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); - kfree(transaction); -} diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c deleted file mode 100644 index bb217dcb41af..000000000000 --- a/fs/jbd/commit.c +++ /dev/null @@ -1,1021 +0,0 @@ -/* - * linux/fs/jbd/commit.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 - * - * Copyright 1998 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Journal commit routines for the generic filesystem journaling code; - * part of the ext2fs journaling system. - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <trace/events/jbd.h> - -/* - * Default IO end handler for temporary BJ_IO buffer_heads. - */ -static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) -{ - BUFFER_TRACE(bh, ""); - if (uptodate) - set_buffer_uptodate(bh); - else - clear_buffer_uptodate(bh); - unlock_buffer(bh); -} - -/* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not successfully freed, because they are attached to a committing transaction. - * After the transaction commits, these pages are left on the LRU, with no - * ->mapping, and with attached buffers. These pages are trivially reclaimable - * by the VM, but their apparent absence upsets the VM accounting, and it makes - * the numbers in /proc/meminfo look odd. - * - * So here, we have a buffer which has just come off the forget list. Look to - * see if we can strip all buffers from the backing page. - * - * Called under journal->j_list_lock. The caller provided us with a ref - * against the buffer, and we drop that here. - */ -static void release_buffer_page(struct buffer_head *bh) -{ - struct page *page; - - if (buffer_dirty(bh)) - goto nope; - if (atomic_read(&bh->b_count) != 1) - goto nope; - page = bh->b_page; - if (!page) - goto nope; - if (page->mapping) - goto nope; - - /* OK, it's a truncated page */ - if (!trylock_page(page)) - goto nope; - - page_cache_get(page); - __brelse(bh); - try_to_free_buffers(page); - unlock_page(page); - page_cache_release(page); - return; - -nope: - __brelse(bh); -} - -/* - * Decrement reference counter for data buffer. If it has been marked - * 'BH_Freed', release it and the page to which it belongs if possible. - */ -static void release_data_buffer(struct buffer_head *bh) -{ - if (buffer_freed(bh)) { - WARN_ON_ONCE(buffer_dirty(bh)); - clear_buffer_freed(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_req(bh); - bh->b_bdev = NULL; - release_buffer_page(bh); - } else - put_bh(bh); -} - -/* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - -/* Done it all: now write the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort - * mode we can now just skip the rest of the journal write - * entirely. - * - * Returns 1 if the journal needs to be aborted or 0 on success - */ -static int journal_write_commit_record(journal_t *journal, - transaction_t *commit_transaction) -{ - struct journal_head *descriptor; - struct buffer_head *bh; - journal_header_t *header; - int ret; - - if (is_journal_aborted(journal)) - return 0; - - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) - return 1; - - bh = jh2bh(descriptor); - - header = (journal_header_t *)(bh->b_data); - header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); - header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - - JBUFFER_TRACE(descriptor, "write commit block"); - set_buffer_dirty(bh); - - if (journal->j_flags & JFS_BARRIER) - ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); - else - ret = sync_dirty_buffer(bh); - - put_bh(bh); /* One for getblk() */ - journal_put_journal_head(descriptor); - - return (ret == -EIO); -} - -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, - int write_op) -{ - int i; - - for (i = 0; i < bufs; i++) { - wbuf[i]->b_end_io = end_buffer_write_sync; - /* - * Here we write back pagecache data that may be mmaped. Since - * we cannot afford to clean the page and set PageWriteback - * here due to lock ordering (page lock ranks above transaction - * start), the data can change while IO is in flight. Tell the - * block layer it should bounce the bio pages if stable data - * during write is required. - * - * We use up our safety reference in submit_bh(). - */ - _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); - } -} - -/* - * Submit all the data buffers to disk - */ -static int journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction, - int write_op) -{ - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; - int err = 0; - - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (!trylock_buffer(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - trace_jbd_do_submit_data(journal, - commit_transaction); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs, write_op); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) || bh2jh(bh) != jh - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - release_data_buffer(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - trace_jbd_do_submit_data(journal, - commit_transaction); - journal_do_submit_data(wbuf, bufs, write_op); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - release_data_buffer(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - spin_unlock(&journal->j_list_lock); - trace_jbd_do_submit_data(journal, commit_transaction); - journal_do_submit_data(wbuf, bufs, write_op); - - return err; -} - -/* - * journal_commit_transaction - * - * The primary function for committing a transaction to the log. This - * function is called by the journal thread to begin a complete commit. - */ -void journal_commit_transaction(journal_t *journal) -{ - transaction_t *commit_transaction; - struct journal_head *jh, *new_jh, *descriptor; - struct buffer_head **wbuf = journal->j_wbuf; - int bufs; - int flags; - int err; - unsigned int blocknr; - ktime_t start_time; - u64 commit_time; - char *tagp = NULL; - journal_header_t *header; - journal_block_tag_t *tag = NULL; - int space_left = 0; - int first_tag = 0; - int tag_flag; - int i; - struct blk_plug plug; - int write_op = WRITE; - - /* - * First job: lock down the current transaction and wait for - * all outstanding updates to complete. - */ - - /* Do we need to erase the effects of a prior journal_flush? */ - if (journal->j_flags & JFS_FLUSHED) { - jbd_debug(3, "super block updated\n"); - mutex_lock(&journal->j_checkpoint_mutex); - /* - * We hold j_checkpoint_mutex so tail cannot change under us. - * We don't need any special data guarantees for writing sb - * since journal is empty and it is ok for write to be - * flushed only with transaction commit. - */ - journal_update_sb_log_tail(journal, journal->j_tail_sequence, - journal->j_tail, WRITE_SYNC); - mutex_unlock(&journal->j_checkpoint_mutex); - } else { - jbd_debug(3, "superblock not updated\n"); - } - - J_ASSERT(journal->j_running_transaction != NULL); - J_ASSERT(journal->j_committing_transaction == NULL); - - commit_transaction = journal->j_running_transaction; - - trace_jbd_start_commit(journal, commit_transaction); - jbd_debug(1, "JBD: starting commit of transaction %d\n", - commit_transaction->t_tid); - - spin_lock(&journal->j_state_lock); - J_ASSERT(commit_transaction->t_state == T_RUNNING); - commit_transaction->t_state = T_LOCKED; - - trace_jbd_commit_locking(journal, commit_transaction); - spin_lock(&commit_transaction->t_handle_lock); - while (commit_transaction->t_updates) { - DEFINE_WAIT(wait); - - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - if (commit_transaction->t_updates) { - spin_unlock(&commit_transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); - schedule(); - spin_lock(&journal->j_state_lock); - spin_lock(&commit_transaction->t_handle_lock); - } - finish_wait(&journal->j_wait_updates, &wait); - } - spin_unlock(&commit_transaction->t_handle_lock); - - J_ASSERT (commit_transaction->t_outstanding_credits <= - journal->j_max_transaction_buffers); - - /* - * First thing we are allowed to do is to discard any remaining - * BJ_Reserved buffers. Note, it is _not_ permissible to assume - * that there are no such buffers: if a large filesystem - * operation like a truncate needs to split itself over multiple - * transactions, then it may try to do a journal_restart() while - * there are still BJ_Reserved buffers outstanding. These must - * be released cleanly from the current transaction. - * - * In this case, the filesystem must still reserve write access - * again before modifying the buffer in the new transaction, but - * we do not require it to remember exactly which old buffers it - * has reserved. This is consistent with the existing behaviour - * that multiple journal_get_write_access() calls to the same - * buffer are perfectly permissible. - */ - while (commit_transaction->t_reserved_list) { - jh = commit_transaction->t_reserved_list; - JBUFFER_TRACE(jh, "reserved, unused: refile"); - /* - * A journal_get_undo_access()+journal_release_buffer() may - * leave undo-committed data. - */ - if (jh->b_committed_data) { - struct buffer_head *bh = jh2bh(jh); - - jbd_lock_bh_state(bh); - jbd_free(jh->b_committed_data, bh->b_size); - jh->b_committed_data = NULL; - jbd_unlock_bh_state(bh); - } - journal_refile_buffer(journal, jh); - } - - /* - * Now try to drop any written-back buffers from the journal's - * checkpoint lists. We do this *before* commit because it potentially - * frees some memory - */ - spin_lock(&journal->j_list_lock); - __journal_clean_checkpoint_list(journal); - spin_unlock(&journal->j_list_lock); - - jbd_debug (3, "JBD: commit phase 1\n"); - - /* - * Clear revoked flag to reflect there is no revoked buffers - * in the next transaction which is going to be started. - */ - journal_clear_buffer_revoked_flags(journal); - - /* - * Switch to a new revoke table. - */ - journal_switch_revoke_table(journal); - - trace_jbd_commit_flushing(journal, commit_transaction); - commit_transaction->t_state = T_FLUSH; - journal->j_committing_transaction = commit_transaction; - journal->j_running_transaction = NULL; - start_time = ktime_get(); - commit_transaction->t_log_start = journal->j_head; - wake_up(&journal->j_wait_transaction_locked); - spin_unlock(&journal->j_state_lock); - - jbd_debug (3, "JBD: commit phase 2\n"); - - if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) - write_op = WRITE_SYNC; - - /* - * Now start flushing things to disk, in the order they appear - * on the transaction lists. Data blocks go first. - */ - blk_start_plug(&plug); - err = journal_submit_data_buffers(journal, commit_transaction, - write_op); - blk_finish_plug(&plug); - - /* - * Wait for all previously submitted IO to complete. - */ - spin_lock(&journal->j_list_lock); - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; - - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - spin_lock(&journal->j_list_lock); - } - if (unlikely(!buffer_uptodate(bh))) { - if (!trylock_page(bh->b_page)) { - spin_unlock(&journal->j_list_lock); - lock_page(bh->b_page); - spin_lock(&journal->j_list_lock); - } - if (bh->b_page->mapping) - set_bit(AS_EIO, &bh->b_page->mapping->flags); - - unlock_page(bh->b_page); - SetPageError(bh->b_page); - err = -EIO; - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && bh2jh(bh) == jh && - jh->b_transaction == commit_transaction && - jh->b_jlist == BJ_Locked) - __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - release_data_buffer(bh); - cond_resched_lock(&journal->j_list_lock); - } - spin_unlock(&journal->j_list_lock); - - if (err) { - char b[BDEVNAME_SIZE]; - - printk(KERN_WARNING - "JBD: Detected IO errors while flushing file data " - "on %s\n", bdevname(journal->j_fs_dev, b)); - if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) - journal_abort(journal, err); - err = 0; - } - - blk_start_plug(&plug); - - journal_write_revoke_records(journal, commit_transaction, write_op); - - /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - - /* - * Way to go: we have now written out all of the data for a - * transaction! Now comes the tricky part: we need to write out - * metadata. Loop over the transaction's entire buffer list: - */ - spin_lock(&journal->j_state_lock); - commit_transaction->t_state = T_COMMIT; - spin_unlock(&journal->j_state_lock); - - trace_jbd_commit_logging(journal, commit_transaction); - J_ASSERT(commit_transaction->t_nr_buffers <= - commit_transaction->t_outstanding_credits); - - descriptor = NULL; - bufs = 0; - while (commit_transaction->t_buffers) { - - /* Find the next buffer to be journaled... */ - - jh = commit_transaction->t_buffers; - - /* If we're in abort mode, we just un-journal the buffer and - release it. */ - - if (is_journal_aborted(journal)) { - clear_buffer_jbddirty(jh2bh(jh)); - JBUFFER_TRACE(jh, "journal is aborting: refile"); - journal_refile_buffer(journal, jh); - /* If that was the last one, we need to clean up - * any descriptor buffers which may have been - * already allocated, even if we are now - * aborting. */ - if (!commit_transaction->t_buffers) - goto start_journal_io; - continue; - } - - /* Make sure we have a descriptor block in which to - record the metadata buffer. */ - - if (!descriptor) { - struct buffer_head *bh; - - J_ASSERT (bufs == 0); - - jbd_debug(4, "JBD: get descriptor\n"); - - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - journal_abort(journal, -EIO); - continue; - } - - bh = jh2bh(descriptor); - jbd_debug(4, "JBD: got buffer %llu (%p)\n", - (unsigned long long)bh->b_blocknr, bh->b_data); - header = (journal_header_t *)&bh->b_data[0]; - header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); - header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - - tagp = &bh->b_data[sizeof(journal_header_t)]; - space_left = bh->b_size - sizeof(journal_header_t); - first_tag = 1; - set_buffer_jwrite(bh); - set_buffer_dirty(bh); - wbuf[bufs++] = bh; - - /* Record it so that we can wait for IO - completion later */ - BUFFER_TRACE(bh, "ph3: file as descriptor"); - journal_file_buffer(descriptor, commit_transaction, - BJ_LogCtl); - } - - /* Where is the buffer to be written? */ - - err = journal_next_log_block(journal, &blocknr); - /* If the block mapping failed, just abandon the buffer - and repeat this loop: we'll fall into the - refile-on-abort condition above. */ - if (err) { - journal_abort(journal, err); - continue; - } - - /* - * start_this_handle() uses t_outstanding_credits to determine - * the free space in the log, but this counter is changed - * by journal_next_log_block() also. - */ - commit_transaction->t_outstanding_credits--; - - /* Bump b_count to prevent truncate from stumbling over - the shadowed buffer! @@@ This can go if we ever get - rid of the BJ_IO/BJ_Shadow pairing of buffers. */ - get_bh(jh2bh(jh)); - - /* Make a temporary IO buffer with which to write it out - (this will requeue both the metadata buffer and the - temporary IO buffer). new_bh goes on BJ_IO*/ - - set_buffer_jwrite(jh2bh(jh)); - /* - * akpm: journal_write_metadata_buffer() sets - * new_bh->b_transaction to commit_transaction. - * We need to clean this up before we release new_bh - * (which is of type BJ_IO) - */ - JBUFFER_TRACE(jh, "ph3: write metadata"); - flags = journal_write_metadata_buffer(commit_transaction, - jh, &new_jh, blocknr); - set_buffer_jwrite(jh2bh(new_jh)); - wbuf[bufs++] = jh2bh(new_jh); - - /* Record the new block's tag in the current descriptor - buffer */ - - tag_flag = 0; - if (flags & 1) - tag_flag |= JFS_FLAG_ESCAPE; - if (!first_tag) - tag_flag |= JFS_FLAG_SAME_UUID; - - tag = (journal_block_tag_t *) tagp; - tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); - tag->t_flags = cpu_to_be32(tag_flag); - tagp += sizeof(journal_block_tag_t); - space_left -= sizeof(journal_block_tag_t); - - if (first_tag) { - memcpy (tagp, journal->j_uuid, 16); - tagp += 16; - space_left -= 16; - first_tag = 0; - } - - /* If there's no more to do, or if the descriptor is full, - let the IO rip! */ - - if (bufs == journal->j_wbufsize || - commit_transaction->t_buffers == NULL || - space_left < sizeof(journal_block_tag_t) + 16) { - - jbd_debug(4, "JBD: Submit %d IOs\n", bufs); - - /* Write an end-of-descriptor marker before - submitting the IOs. "tag" still points to - the last tag we set up. */ - - tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); - -start_journal_io: - for (i = 0; i < bufs; i++) { - struct buffer_head *bh = wbuf[i]; - lock_buffer(bh); - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - bh->b_end_io = journal_end_buffer_io_sync; - /* - * In data=journal mode, here we can end up - * writing pagecache data that might be - * mmapped. Since we can't afford to clean the - * page and set PageWriteback (see the comment - * near the other use of _submit_bh()), the - * data can change while the write is in - * flight. Tell the block layer to bounce the - * bio pages if stable pages are required. - */ - _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); - } - cond_resched(); - - /* Force a new descriptor to be generated next - time round the loop. */ - descriptor = NULL; - bufs = 0; - } - } - - blk_finish_plug(&plug); - - /* Lo and behold: we have just managed to send a transaction to - the log. Before we can commit it, wait for the IO so far to - complete. Control buffers being written are on the - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. - - Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. - */ - - jbd_debug(3, "JBD: commit phase 4\n"); - - /* - * akpm: these are BJ_IO, and j_list_lock is not needed. - * See __journal_try_to_free_buffer. - */ -wait_for_iobuf: - while (commit_transaction->t_iobuf_list != NULL) { - struct buffer_head *bh; - - jh = commit_transaction->t_iobuf_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_iobuf; - } - if (cond_resched()) - goto wait_for_iobuf; - - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - - clear_buffer_jwrite(bh); - - JBUFFER_TRACE(jh, "ph4: unfile after journal write"); - journal_unfile_buffer(journal, jh); - - /* - * ->t_iobuf_list should contain only dummy buffer_heads - * which were created by journal_write_metadata_buffer(). - */ - BUFFER_TRACE(bh, "dumping temporary bh"); - journal_put_journal_head(jh); - __brelse(bh); - J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); - free_buffer_head(bh); - - /* We also have to unlock and free the corresponding - shadowed buffer */ - jh = commit_transaction->t_shadow_list->b_tprev; - bh = jh2bh(jh); - clear_buffer_jwrite(bh); - J_ASSERT_BH(bh, buffer_jbddirty(bh)); - - /* The metadata is now released for reuse, but we need - to remember it against this transaction so that when - we finally commit, we can do any checkpointing - required. */ - JBUFFER_TRACE(jh, "file as BJ_Forget"); - journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* - * Wake up any transactions which were waiting for this - * IO to complete. The barrier must be here so that changes - * by journal_file_buffer() take effect before wake_up_bit() - * does the waitqueue check. - */ - smp_mb(); - wake_up_bit(&bh->b_state, BH_Unshadow); - JBUFFER_TRACE(jh, "brelse shadowed buffer"); - __brelse(bh); - } - - J_ASSERT (commit_transaction->t_shadow_list == NULL); - - jbd_debug(3, "JBD: commit phase 5\n"); - - /* Here we wait for the revoke record and descriptor record buffers */ - wait_for_ctlbuf: - while (commit_transaction->t_log_list != NULL) { - struct buffer_head *bh; - - jh = commit_transaction->t_log_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_ctlbuf; - } - if (cond_resched()) - goto wait_for_ctlbuf; - - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - - BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); - clear_buffer_jwrite(bh); - journal_unfile_buffer(journal, jh); - journal_put_journal_head(jh); - __brelse(bh); /* One for getblk */ - /* AKPM: bforget here */ - } - - if (err) - journal_abort(journal, err); - - jbd_debug(3, "JBD: commit phase 6\n"); - - /* All metadata is written, now write commit record and do cleanup */ - spin_lock(&journal->j_state_lock); - J_ASSERT(commit_transaction->t_state == T_COMMIT); - commit_transaction->t_state = T_COMMIT_RECORD; - spin_unlock(&journal->j_state_lock); - - if (journal_write_commit_record(journal, commit_transaction)) - err = -EIO; - - if (err) - journal_abort(journal, err); - - /* End of a transaction! Finally, we can do checkpoint - processing: any buffers committed as a result of this - transaction can be removed from any checkpoint list it was on - before. */ - - jbd_debug(3, "JBD: commit phase 7\n"); - - J_ASSERT(commit_transaction->t_sync_datalist == NULL); - J_ASSERT(commit_transaction->t_buffers == NULL); - J_ASSERT(commit_transaction->t_checkpoint_list == NULL); - J_ASSERT(commit_transaction->t_iobuf_list == NULL); - J_ASSERT(commit_transaction->t_shadow_list == NULL); - J_ASSERT(commit_transaction->t_log_list == NULL); - -restart_loop: - /* - * As there are other places (journal_unmap_buffer()) adding buffers - * to this list we have to be careful and hold the j_list_lock. - */ - spin_lock(&journal->j_list_lock); - while (commit_transaction->t_forget) { - transaction_t *cp_transaction; - struct buffer_head *bh; - int try_to_free = 0; - - jh = commit_transaction->t_forget; - spin_unlock(&journal->j_list_lock); - bh = jh2bh(jh); - /* - * Get a reference so that bh cannot be freed before we are - * done with it. - */ - get_bh(bh); - jbd_lock_bh_state(bh); - J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || - jh->b_transaction == journal->j_running_transaction); - - /* - * If there is undo-protected committed data against - * this buffer, then we can remove it now. If it is a - * buffer needing such protection, the old frozen_data - * field now points to a committed version of the - * buffer, so rotate that field to the new committed - * data. - * - * Otherwise, we can just throw away the frozen data now. - */ - if (jh->b_committed_data) { - jbd_free(jh->b_committed_data, bh->b_size); - jh->b_committed_data = NULL; - if (jh->b_frozen_data) { - jh->b_committed_data = jh->b_frozen_data; - jh->b_frozen_data = NULL; - } - } else if (jh->b_frozen_data) { - jbd_free(jh->b_frozen_data, bh->b_size); - jh->b_frozen_data = NULL; - } - - spin_lock(&journal->j_list_lock); - cp_transaction = jh->b_cp_transaction; - if (cp_transaction) { - JBUFFER_TRACE(jh, "remove from old cp transaction"); - __journal_remove_checkpoint(jh); - } - - /* Only re-checkpoint the buffer_head if it is marked - * dirty. If the buffer was added to the BJ_Forget list - * by journal_forget, it may no longer be dirty and - * there's no point in keeping a checkpoint record for - * it. */ - - /* - * A buffer which has been freed while still being journaled by - * a previous transaction. - */ - if (buffer_freed(bh)) { - /* - * If the running transaction is the one containing - * "add to orphan" operation (b_next_transaction != - * NULL), we have to wait for that transaction to - * commit before we can really get rid of the buffer. - * So just clear b_modified to not confuse transaction - * credit accounting and refile the buffer to - * BJ_Forget of the running transaction. If the just - * committed transaction contains "add to orphan" - * operation, we can completely invalidate the buffer - * now. We are rather throughout in that since the - * buffer may be still accessible when blocksize < - * pagesize and it is attached to the last partial - * page. - */ - jh->b_modified = 0; - if (!jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_req(bh); - bh->b_bdev = NULL; - } - } - - if (buffer_jbddirty(bh)) { - JBUFFER_TRACE(jh, "add to new checkpointing trans"); - __journal_insert_checkpoint(jh, commit_transaction); - if (is_journal_aborted(journal)) - clear_buffer_jbddirty(bh); - } else { - J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* - * The buffer on BJ_Forget list and not jbddirty means - * it has been freed by this transaction and hence it - * could not have been reallocated until this - * transaction has committed. *BUT* it could be - * reallocated once we have written all the data to - * disk and before we process the buffer on BJ_Forget - * list. - */ - if (!jh->b_next_transaction) - try_to_free = 1; - } - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); - if (try_to_free) - release_buffer_page(bh); - else - __brelse(bh); - cond_resched_lock(&journal->j_list_lock); - } - spin_unlock(&journal->j_list_lock); - /* - * This is a bit sleazy. We use j_list_lock to protect transition - * of a transaction into T_FINISHED state and calling - * __journal_drop_transaction(). Otherwise we could race with - * other checkpointing code processing the transaction... - */ - spin_lock(&journal->j_state_lock); - spin_lock(&journal->j_list_lock); - /* - * Now recheck if some buffers did not get attached to the transaction - * while the lock was dropped... - */ - if (commit_transaction->t_forget) { - spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); - goto restart_loop; - } - - /* Done with this transaction! */ - - jbd_debug(3, "JBD: commit phase 8\n"); - - J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); - - commit_transaction->t_state = T_FINISHED; - J_ASSERT(commit_transaction == journal->j_committing_transaction); - journal->j_commit_sequence = commit_transaction->t_tid; - journal->j_committing_transaction = NULL; - commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); - - /* - * weight the commit time higher than the average time so we don't - * react too strongly to vast changes in commit time - */ - if (likely(journal->j_average_commit_time)) - journal->j_average_commit_time = (commit_time*3 + - journal->j_average_commit_time) / 4; - else - journal->j_average_commit_time = commit_time; - - spin_unlock(&journal->j_state_lock); - - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { - __journal_drop_transaction(journal, commit_transaction); - } else { - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = - commit_transaction; - } - } - spin_unlock(&journal->j_list_lock); - - trace_jbd_end_commit(journal, commit_transaction); - jbd_debug(1, "JBD: commit %d complete, head %d\n", - journal->j_commit_sequence, journal->j_tail_sequence); - - wake_up(&journal->j_wait_done_commit); -} diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c deleted file mode 100644 index c46a79adb6ad..000000000000 --- a/fs/jbd/journal.c +++ /dev/null @@ -1,2145 +0,0 @@ -/* - * linux/fs/jbd/journal.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 - * - * Copyright 1998 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Generic filesystem journal-writing code; part of the ext2fs - * journaling system. - * - * This file manages journals: areas of disk reserved for logging - * transactional updates. This includes the kernel journaling thread - * which is responsible for scheduling updates to the log. - * - * We do not actually manage the physical storage of the journal in this - * file: that is left to a per-journal policy function, which allows us - * to store the journal within a filesystem-specified area for ext2 - * journaling (ext2 can use a reserved inode for storing the log). - */ - -#include <linux/module.h> -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/freezer.h> -#include <linux/pagemap.h> -#include <linux/kthread.h> -#include <linux/poison.h> -#include <linux/proc_fs.h> -#include <linux/debugfs.h> -#include <linux/ratelimit.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/jbd.h> - -#include <asm/uaccess.h> -#include <asm/page.h> - -EXPORT_SYMBOL(journal_start); -EXPORT_SYMBOL(journal_restart); -EXPORT_SYMBOL(journal_extend); -EXPORT_SYMBOL(journal_stop); -EXPORT_SYMBOL(journal_lock_updates); -EXPORT_SYMBOL(journal_unlock_updates); -EXPORT_SYMBOL(journal_get_write_access); -EXPORT_SYMBOL(journal_get_create_access); -EXPORT_SYMBOL(journal_get_undo_access); -EXPORT_SYMBOL(journal_dirty_data); -EXPORT_SYMBOL(journal_dirty_metadata); -EXPORT_SYMBOL(journal_release_buffer); -EXPORT_SYMBOL(journal_forget); -#if 0 -EXPORT_SYMBOL(journal_sync_buffer); -#endif -EXPORT_SYMBOL(journal_flush); -EXPORT_SYMBOL(journal_revoke); - -EXPORT_SYMBOL(journal_init_dev); -EXPORT_SYMBOL(journal_init_inode); -EXPORT_SYMBOL(journal_update_format); -EXPORT_SYMBOL(journal_check_used_features); -EXPORT_SYMBOL(journal_check_available_features); -EXPORT_SYMBOL(journal_set_features); -EXPORT_SYMBOL(journal_create); -EXPORT_SYMBOL(journal_load); -EXPORT_SYMBOL(journal_destroy); -EXPORT_SYMBOL(journal_abort); -EXPORT_SYMBOL(journal_errno); -EXPORT_SYMBOL(journal_ack_err); -EXPORT_SYMBOL(journal_clear_err); -EXPORT_SYMBOL(log_wait_commit); -EXPORT_SYMBOL(log_start_commit); -EXPORT_SYMBOL(journal_start_commit); -EXPORT_SYMBOL(journal_force_commit_nested); -EXPORT_SYMBOL(journal_wipe); -EXPORT_SYMBOL(journal_blocks_per_page); -EXPORT_SYMBOL(journal_invalidatepage); -EXPORT_SYMBOL(journal_try_to_free_buffers); -EXPORT_SYMBOL(journal_force_commit); - -static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); -static void __journal_abort_soft (journal_t *journal, int errno); -static const char *journal_dev_name(journal_t *journal, char *buffer); - -#ifdef CONFIG_JBD_DEBUG -void __jbd_debug(int level, const char *file, const char *func, - unsigned int line, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - if (level > journal_enable_debug) - return; - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); - va_end(args); -} -EXPORT_SYMBOL(__jbd_debug); -#endif - -/* - * Helper function used to manage commit timeouts - */ - -static void commit_timeout(unsigned long __data) -{ - struct task_struct * p = (struct task_struct *) __data; - - wake_up_process(p); -} - -/* - * kjournald: The main thread function used to manage a logging device - * journal. - * - * This kernel thread is responsible for two things: - * - * 1) COMMIT: Every so often we need to commit the current state of the - * filesystem to disk. The journal thread is responsible for writing - * all of the metadata buffers to disk. - * - * 2) CHECKPOINT: We cannot reuse a used section of the log file until all - * of the data in that part of the log has been rewritten elsewhere on - * the disk. Flushing these old buffers to reclaim space in the log is - * known as checkpointing, and this thread is responsible for that job. - */ - -static int kjournald(void *arg) -{ - journal_t *journal = arg; - transaction_t *transaction; - - /* - * Set up an interval timer which can be used to trigger a commit wakeup - * after the commit interval expires - */ - setup_timer(&journal->j_commit_timer, commit_timeout, - (unsigned long)current); - - set_freezable(); - - /* Record that the journal thread is running */ - journal->j_task = current; - wake_up(&journal->j_wait_done_commit); - - printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", - journal->j_commit_interval / HZ); - - /* - * And now, wait forever for commit wakeup events. - */ - spin_lock(&journal->j_state_lock); - -loop: - if (journal->j_flags & JFS_UNMOUNT) - goto end_loop; - - jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", - journal->j_commit_sequence, journal->j_commit_request); - - if (journal->j_commit_sequence != journal->j_commit_request) { - jbd_debug(1, "OK, requests differ\n"); - spin_unlock(&journal->j_state_lock); - del_timer_sync(&journal->j_commit_timer); - journal_commit_transaction(journal); - spin_lock(&journal->j_state_lock); - goto loop; - } - - wake_up(&journal->j_wait_done_commit); - if (freezing(current)) { - /* - * The simpler the better. Flushing journal isn't a - * good idea, because that depends on threads that may - * be already stopped. - */ - jbd_debug(1, "Now suspending kjournald\n"); - spin_unlock(&journal->j_state_lock); - try_to_freeze(); - spin_lock(&journal->j_state_lock); - } else { - /* - * We assume on resume that commits are already there, - * so we don't sleep - */ - DEFINE_WAIT(wait); - int should_sleep = 1; - - prepare_to_wait(&journal->j_wait_commit, &wait, - TASK_INTERRUPTIBLE); - if (journal->j_commit_sequence != journal->j_commit_request) - should_sleep = 0; - transaction = journal->j_running_transaction; - if (transaction && time_after_eq(jiffies, - transaction->t_expires)) - should_sleep = 0; - if (journal->j_flags & JFS_UNMOUNT) - should_sleep = 0; - if (should_sleep) { - spin_unlock(&journal->j_state_lock); - schedule(); - spin_lock(&journal->j_state_lock); - } - finish_wait(&journal->j_wait_commit, &wait); - } - - jbd_debug(1, "kjournald wakes\n"); - - /* - * Were we woken up by a commit wakeup event? - */ - transaction = journal->j_running_transaction; - if (transaction && time_after_eq(jiffies, transaction->t_expires)) { - journal->j_commit_request = transaction->t_tid; - jbd_debug(1, "woke because of timeout\n"); - } - goto loop; - -end_loop: - spin_unlock(&journal->j_state_lock); - del_timer_sync(&journal->j_commit_timer); - journal->j_task = NULL; - wake_up(&journal->j_wait_done_commit); - jbd_debug(1, "Journal thread exiting.\n"); - return 0; -} - -static int journal_start_thread(journal_t *journal) -{ - struct task_struct *t; - - t = kthread_run(kjournald, journal, "kjournald"); - if (IS_ERR(t)) - return PTR_ERR(t); - - wait_event(journal->j_wait_done_commit, journal->j_task != NULL); - return 0; -} - -static void journal_kill_thread(journal_t *journal) -{ - spin_lock(&journal->j_state_lock); - journal->j_flags |= JFS_UNMOUNT; - - while (journal->j_task) { - wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); - wait_event(journal->j_wait_done_commit, - journal->j_task == NULL); - spin_lock(&journal->j_state_lock); - } - spin_unlock(&journal->j_state_lock); -} - -/* - * journal_write_metadata_buffer: write a metadata buffer to the journal. - * - * Writes a metadata buffer to a given disk block. The actual IO is not - * performed but a new buffer_head is constructed which labels the data - * to be written with the correct destination disk block. - * - * Any magic-number escaping which needs to be done will cause a - * copy-out here. If the buffer happens to start with the - * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the - * magic number is only written to the log for descripter blocks. In - * this case, we copy the data and replace the first word with 0, and we - * return a result code which indicates that this buffer needs to be - * marked as an escaped buffer in the corresponding log descriptor - * block. The missing word can then be restored when the block is read - * during recovery. - * - * If the source buffer has already been modified by a new transaction - * since we took the last commit snapshot, we use the frozen copy of - * that data for IO. If we end up using the existing buffer_head's data - * for the write, then we *have* to lock the buffer to prevent anyone - * else from using and possibly modifying it while the IO is in - * progress. - * - * The function returns a pointer to the buffer_heads to be used for IO. - * - * We assume that the journal has already been locked in this function. - * - * Return value: - * <0: Error - * >=0: Finished OK - * - * On success: - * Bit 0 set == escape performed on the data - * Bit 1 set == buffer copy-out performed (kfree the data after IO) - */ - -int journal_write_metadata_buffer(transaction_t *transaction, - struct journal_head *jh_in, - struct journal_head **jh_out, - unsigned int blocknr) -{ - int need_copy_out = 0; - int done_copy_out = 0; - int do_escape = 0; - char *mapped_data; - struct buffer_head *new_bh; - struct journal_head *new_jh; - struct page *new_page; - unsigned int new_offset; - struct buffer_head *bh_in = jh2bh(jh_in); - journal_t *journal = transaction->t_journal; - - /* - * The buffer really shouldn't be locked: only the current committing - * transaction is allowed to write it, so nobody else is allowed - * to do any IO. - * - * akpm: except if we're journalling data, and write() output is - * also part of a shared mapping, and another thread has - * decided to launch a writepage() against this buffer. - */ - J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); - - new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); - /* keep subsequent assertions sane */ - atomic_set(&new_bh->b_count, 1); - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - - /* - * If a new transaction has already done a buffer copy-out, then - * we use that version of the data for the commit. - */ - jbd_lock_bh_state(bh_in); -repeat: - if (jh_in->b_frozen_data) { - done_copy_out = 1; - new_page = virt_to_page(jh_in->b_frozen_data); - new_offset = offset_in_page(jh_in->b_frozen_data); - } else { - new_page = jh2bh(jh_in)->b_page; - new_offset = offset_in_page(jh2bh(jh_in)->b_data); - } - - mapped_data = kmap_atomic(new_page); - /* - * Check for escaping - */ - if (*((__be32 *)(mapped_data + new_offset)) == - cpu_to_be32(JFS_MAGIC_NUMBER)) { - need_copy_out = 1; - do_escape = 1; - } - kunmap_atomic(mapped_data); - - /* - * Do we need to do a data copy? - */ - if (need_copy_out && !done_copy_out) { - char *tmp; - - jbd_unlock_bh_state(bh_in); - tmp = jbd_alloc(bh_in->b_size, GFP_NOFS); - jbd_lock_bh_state(bh_in); - if (jh_in->b_frozen_data) { - jbd_free(tmp, bh_in->b_size); - goto repeat; - } - - jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page); - memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); - kunmap_atomic(mapped_data); - - new_page = virt_to_page(tmp); - new_offset = offset_in_page(tmp); - done_copy_out = 1; - } - - /* - * Did we need to do an escaping? Now we've done all the - * copying, we can finally do so. - */ - if (do_escape) { - mapped_data = kmap_atomic(new_page); - *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data); - } - - set_bh_page(new_bh, new_page, new_offset); - new_jh->b_transaction = NULL; - new_bh->b_size = jh2bh(jh_in)->b_size; - new_bh->b_bdev = transaction->t_journal->j_dev; - new_bh->b_blocknr = blocknr; - set_buffer_mapped(new_bh); - set_buffer_dirty(new_bh); - - *jh_out = new_jh; - - /* - * The to-be-written buffer needs to get moved to the io queue, - * and the original buffer whose contents we are shadowing or - * copying is moved to the transaction's shadow queue. - */ - JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - spin_lock(&journal->j_list_lock); - __journal_file_buffer(jh_in, transaction, BJ_Shadow); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh_in); - - JBUFFER_TRACE(new_jh, "file as BJ_IO"); - journal_file_buffer(new_jh, transaction, BJ_IO); - - return do_escape | (done_copy_out << 1); -} - -/* - * Allocation code for the journal file. Manage the space left in the - * journal, so that we can begin checkpointing when appropriate. - */ - -/* - * __log_space_left: Return the number of free blocks left in the journal. - * - * Called with the journal already locked. - * - * Called under j_state_lock - */ - -int __log_space_left(journal_t *journal) -{ - int left = journal->j_free; - - assert_spin_locked(&journal->j_state_lock); - - /* - * Be pessimistic here about the number of those free blocks which - * might be required for log descriptor control blocks. - */ - -#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ - - left -= MIN_LOG_RESERVED_BLOCKS; - - if (left <= 0) - return 0; - left -= (left >> 3); - return left; -} - -/* - * Called under j_state_lock. Returns true if a transaction commit was started. - */ -int __log_start_commit(journal_t *journal, tid_t target) -{ - /* - * The only transaction we can possibly wait upon is the - * currently running transaction (if it exists). Otherwise, - * the target tid must be an old one. - */ - if (journal->j_commit_request != target && - journal->j_running_transaction && - journal->j_running_transaction->t_tid == target) { - /* - * We want a new commit: OK, mark the request and wakeup the - * commit thread. We do _not_ do the commit ourselves. - */ - - journal->j_commit_request = target; - jbd_debug(1, "JBD: requesting commit %d/%d\n", - journal->j_commit_request, - journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); - return 1; - } else if (!tid_geq(journal->j_commit_request, target)) - /* This should never happen, but if it does, preserve - the evidence before kjournald goes into a loop and - increments j_commit_sequence beyond all recognition. */ - WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", - journal->j_commit_request, journal->j_commit_sequence, - target, journal->j_running_transaction ? - journal->j_running_transaction->t_tid : 0); - return 0; -} - -int log_start_commit(journal_t *journal, tid_t tid) -{ - int ret; - - spin_lock(&journal->j_state_lock); - ret = __log_start_commit(journal, tid); - spin_unlock(&journal->j_state_lock); - return ret; -} - -/* - * Force and wait upon a commit if the calling process is not within - * transaction. This is used for forcing out undo-protected data which contains - * bitmaps, when the fs is running out of space. - * - * We can only force the running transaction if we don't have an active handle; - * otherwise, we will deadlock. - * - * Returns true if a transaction was started. - */ -int journal_force_commit_nested(journal_t *journal) -{ - transaction_t *transaction = NULL; - tid_t tid; - - spin_lock(&journal->j_state_lock); - if (journal->j_running_transaction && !current->journal_info) { - transaction = journal->j_running_transaction; - __log_start_commit(journal, transaction->t_tid); - } else if (journal->j_committing_transaction) - transaction = journal->j_committing_transaction; - - if (!transaction) { - spin_unlock(&journal->j_state_lock); - return 0; /* Nothing to retry */ - } - - tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); - log_wait_commit(journal, tid); - return 1; -} - -/* - * Start a commit of the current running transaction (if any). Returns true - * if a transaction is going to be committed (or is currently already - * committing), and fills its tid in at *ptid - */ -int journal_start_commit(journal_t *journal, tid_t *ptid) -{ - int ret = 0; - - spin_lock(&journal->j_state_lock); - if (journal->j_running_transaction) { - tid_t tid = journal->j_running_transaction->t_tid; - - __log_start_commit(journal, tid); - /* There's a running transaction and we've just made sure - * it's commit has been scheduled. */ - if (ptid) - *ptid = tid; - ret = 1; - } else if (journal->j_committing_transaction) { - /* - * If commit has been started, then we have to wait for - * completion of that transaction. - */ - if (ptid) - *ptid = journal->j_committing_transaction->t_tid; - ret = 1; - } - spin_unlock(&journal->j_state_lock); - return ret; -} - -/* - * Wait for a specified commit to complete. - * The caller may not hold the journal lock. - */ -int log_wait_commit(journal_t *journal, tid_t tid) -{ - int err = 0; - -#ifdef CONFIG_JBD_DEBUG - spin_lock(&journal->j_state_lock); - if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_ERR - "%s: error: j_commit_request=%d, tid=%d\n", - __func__, journal->j_commit_request, tid); - } - spin_unlock(&journal->j_state_lock); -#endif - spin_lock(&journal->j_state_lock); - /* - * Not running or committing trans? Must be already committed. This - * saves us from waiting for a *long* time when tid overflows. - */ - if (!((journal->j_running_transaction && - journal->j_running_transaction->t_tid == tid) || - (journal->j_committing_transaction && - journal->j_committing_transaction->t_tid == tid))) - goto out_unlock; - - if (!tid_geq(journal->j_commit_waited, tid)) - journal->j_commit_waited = tid; - while (tid_gt(tid, journal->j_commit_sequence)) { - jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", - tid, journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); - wait_event(journal->j_wait_done_commit, - !tid_gt(tid, journal->j_commit_sequence)); - spin_lock(&journal->j_state_lock); - } -out_unlock: - spin_unlock(&journal->j_state_lock); - - if (unlikely(is_journal_aborted(journal))) - err = -EIO; - return err; -} - -/* - * Return 1 if a given transaction has not yet sent barrier request - * connected with a transaction commit. If 0 is returned, transaction - * may or may not have sent the barrier. Used to avoid sending barrier - * twice in common cases. - */ -int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) -{ - int ret = 0; - transaction_t *commit_trans; - - if (!(journal->j_flags & JFS_BARRIER)) - return 0; - spin_lock(&journal->j_state_lock); - /* Transaction already committed? */ - if (tid_geq(journal->j_commit_sequence, tid)) - goto out; - /* - * Transaction is being committed and we already proceeded to - * writing commit record? - */ - commit_trans = journal->j_committing_transaction; - if (commit_trans && commit_trans->t_tid == tid && - commit_trans->t_state >= T_COMMIT_RECORD) - goto out; - ret = 1; -out: - spin_unlock(&journal->j_state_lock); - return ret; -} -EXPORT_SYMBOL(journal_trans_will_send_data_barrier); - -/* - * Log buffer allocation routines: - */ - -int journal_next_log_block(journal_t *journal, unsigned int *retp) -{ - unsigned int blocknr; - - spin_lock(&journal->j_state_lock); - J_ASSERT(journal->j_free > 1); - - blocknr = journal->j_head; - journal->j_head++; - journal->j_free--; - if (journal->j_head == journal->j_last) - journal->j_head = journal->j_first; - spin_unlock(&journal->j_state_lock); - return journal_bmap(journal, blocknr, retp); -} - -/* - * Conversion of logical to physical block numbers for the journal - * - * On external journals the journal blocks are identity-mapped, so - * this is a no-op. If needed, we can use j_blk_offset - everything is - * ready. - */ -int journal_bmap(journal_t *journal, unsigned int blocknr, - unsigned int *retp) -{ - int err = 0; - unsigned int ret; - - if (journal->j_inode) { - ret = bmap(journal->j_inode, blocknr); - if (ret) - *retp = ret; - else { - char b[BDEVNAME_SIZE]; - - printk(KERN_ALERT "%s: journal block not found " - "at offset %u on %s\n", - __func__, - blocknr, - bdevname(journal->j_dev, b)); - err = -EIO; - __journal_abort_soft(journal, err); - } - } else { - *retp = blocknr; /* +journal->j_blk_offset */ - } - return err; -} - -/* - * We play buffer_head aliasing tricks to write data/metadata blocks to - * the journal without copying their contents, but for journal - * descriptor blocks we do need to generate bona fide buffers. - * - * After the caller of journal_get_descriptor_buffer() has finished modifying - * the buffer's contents they really should run flush_dcache_page(bh->b_page). - * But we don't bother doing that, so there will be coherency problems with - * mmaps of blockdevs which hold live JBD-controlled filesystems. - */ -struct journal_head *journal_get_descriptor_buffer(journal_t *journal) -{ - struct buffer_head *bh; - unsigned int blocknr; - int err; - - err = journal_next_log_block(journal, &blocknr); - - if (err) - return NULL; - - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) - return NULL; - lock_buffer(bh); - memset(bh->b_data, 0, journal->j_blocksize); - set_buffer_uptodate(bh); - unlock_buffer(bh); - BUFFER_TRACE(bh, "return this buffer"); - return journal_add_journal_head(bh); -} - -/* - * Management for journal control blocks: functions to create and - * destroy journal_t structures, and to initialise and read existing - * journal blocks from disk. */ - -/* First: create and setup a journal_t object in memory. We initialise - * very few fields yet: that has to wait until we have created the - * journal structures from from scratch, or loaded them from disk. */ - -static journal_t * journal_init_common (void) -{ - journal_t *journal; - int err; - - journal = kzalloc(sizeof(*journal), GFP_KERNEL); - if (!journal) - goto fail; - - init_waitqueue_head(&journal->j_wait_transaction_locked); - init_waitqueue_head(&journal->j_wait_logspace); - init_waitqueue_head(&journal->j_wait_done_commit); - init_waitqueue_head(&journal->j_wait_checkpoint); - init_waitqueue_head(&journal->j_wait_commit); - init_waitqueue_head(&journal->j_wait_updates); - mutex_init(&journal->j_checkpoint_mutex); - spin_lock_init(&journal->j_revoke_lock); - spin_lock_init(&journal->j_list_lock); - spin_lock_init(&journal->j_state_lock); - - journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE); - - /* The journal is marked for error until we succeed with recovery! */ - journal->j_flags = JFS_ABORT; - - /* Set up a default-sized revoke table for the new mount. */ - err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); - if (err) { - kfree(journal); - goto fail; - } - return journal; -fail: - return NULL; -} - -/* journal_init_dev and journal_init_inode: - * - * Create a journal structure assigned some fixed set of disk blocks to - * the journal. We don't actually touch those disk blocks yet, but we - * need to set up all of the mapping information to tell the journaling - * system where the journal blocks are. - * - */ - -/** - * journal_t * journal_init_dev() - creates and initialises a journal structure - * @bdev: Block device on which to create the journal - * @fs_dev: Device which hold journalled filesystem for this journal. - * @start: Block nr Start of journal. - * @len: Length of the journal in blocks. - * @blocksize: blocksize of journalling device - * - * Returns: a newly created journal_t * - * - * journal_init_dev creates a journal which maps a fixed contiguous - * range of blocks on an arbitrary block device. - * - */ -journal_t * journal_init_dev(struct block_device *bdev, - struct block_device *fs_dev, - int start, int len, int blocksize) -{ - journal_t *journal = journal_init_common(); - struct buffer_head *bh; - int n; - - if (!journal) - return NULL; - - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; - - bh = __getblk(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - - return journal; -out_err: - kfree(journal->j_wbuf); - kfree(journal); - return NULL; -} - -/** - * journal_t * journal_init_inode () - creates a journal which maps to a inode. - * @inode: An inode to create the journal in - * - * journal_init_inode creates a journal which maps an on-disk inode as - * the journal. The inode must exist already, must support bmap() and - * must have all data blocks preallocated. - */ -journal_t * journal_init_inode (struct inode *inode) -{ - struct buffer_head *bh; - journal_t *journal = journal_init_common(); - int err; - int n; - unsigned int blocknr; - - if (!journal) - return NULL; - - journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; - journal->j_inode = inode; - jbd_debug(1, - "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", - journal, inode->i_sb->s_id, inode->i_ino, - (long long) inode->i_size, - inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - - journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; - journal->j_blocksize = inode->i_sb->s_blocksize; - - /* journal descriptor can store up to n blocks -bzzz */ - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - - err = journal_bmap(journal, 0, &blocknr); - /* If that failed, give up */ - if (err) { - printk(KERN_ERR "%s: Cannot locate journal superblock\n", - __func__); - goto out_err; - } - - bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - - return journal; -out_err: - kfree(journal->j_wbuf); - kfree(journal); - return NULL; -} - -/* - * If the journal init or create aborts, we need to mark the journal - * superblock as being NULL to prevent the journal destroy from writing - * back a bogus superblock. - */ -static void journal_fail_superblock (journal_t *journal) -{ - struct buffer_head *bh = journal->j_sb_buffer; - brelse(bh); - journal->j_sb_buffer = NULL; -} - -/* - * Given a journal_t structure, initialise the various fields for - * startup of a new journaling session. We use this both when creating - * a journal, and after recovering an old journal to reset it for - * subsequent use. - */ - -static int journal_reset(journal_t *journal) -{ - journal_superblock_t *sb = journal->j_superblock; - unsigned int first, last; - - first = be32_to_cpu(sb->s_first); - last = be32_to_cpu(sb->s_maxlen); - if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { - printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", - first, last); - journal_fail_superblock(journal); - return -EINVAL; - } - - journal->j_first = first; - journal->j_last = last; - - journal->j_head = first; - journal->j_tail = first; - journal->j_free = last - first; - - journal->j_tail_sequence = journal->j_transaction_sequence; - journal->j_commit_sequence = journal->j_transaction_sequence - 1; - journal->j_commit_request = journal->j_commit_sequence; - - journal->j_max_transaction_buffers = journal->j_maxlen / 4; - - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0), then we can safely defer the superblock - * update until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. - */ - if (sb->s_start == 0) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - journal->j_flags |= JFS_FLUSHED; - } else { - /* Lock here to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); - /* - * Update log tail information. We use WRITE_FUA since new - * transaction will start reusing journal space and so we - * must make sure information about current log tail is on - * disk before that. - */ - journal_update_sb_log_tail(journal, - journal->j_tail_sequence, - journal->j_tail, - WRITE_FUA); - mutex_unlock(&journal->j_checkpoint_mutex); - } - return journal_start_thread(journal); -} - -/** - * int journal_create() - Initialise the new journal file - * @journal: Journal to create. This structure must have been initialised - * - * Given a journal_t structure which tells us which disk blocks we can - * use, create a new journal superblock and initialise all of the - * journal fields from scratch. - **/ -int journal_create(journal_t *journal) -{ - unsigned int blocknr; - struct buffer_head *bh; - journal_superblock_t *sb; - int i, err; - - if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { - printk (KERN_ERR "Journal length (%d blocks) too short.\n", - journal->j_maxlen); - journal_fail_superblock(journal); - return -EINVAL; - } - - if (journal->j_inode == NULL) { - /* - * We don't know what block to start at! - */ - printk(KERN_EMERG - "%s: creation of journal on external device!\n", - __func__); - BUG(); - } - - /* Zero out the entire journal on disk. We cannot afford to - have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ - jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); - for (i = 0; i < journal->j_maxlen; i++) { - err = journal_bmap(journal, i, &blocknr); - if (err) - return err; - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (unlikely(!bh)) - return -ENOMEM; - lock_buffer(bh); - memset (bh->b_data, 0, journal->j_blocksize); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - __brelse(bh); - } - - sync_blockdev(journal->j_dev); - jbd_debug(1, "JBD: journal cleared.\n"); - - /* OK, fill in the initial static fields in the new superblock */ - sb = journal->j_superblock; - - sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); - - sb->s_blocksize = cpu_to_be32(journal->j_blocksize); - sb->s_maxlen = cpu_to_be32(journal->j_maxlen); - sb->s_first = cpu_to_be32(1); - - journal->j_transaction_sequence = 1; - - journal->j_flags &= ~JFS_ABORT; - journal->j_format_version = 2; - - return journal_reset(journal); -} - -static void journal_write_superblock(journal_t *journal, int write_op) -{ - struct buffer_head *bh = journal->j_sb_buffer; - int ret; - - trace_journal_write_superblock(journal, write_op); - if (!(journal->j_flags & JFS_BARRIER)) - write_op &= ~(REQ_FUA | REQ_FLUSH); - lock_buffer(bh); - if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - /* - * Oh, dear. A previous attempt to write the journal - * superblock failed. This could happen because the - * USB device was yanked out. Or it could happen to - * be a transient write error and maybe the block will - * be remapped. Nothing we can do but to retry the - * write and hope for the best. - */ - printk(KERN_ERR "JBD: previous I/O error detected " - "for journal superblock update for %s.\n", - journal_dev_name(journal, b)); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - - get_bh(bh); - bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(write_op, bh); - wait_on_buffer(bh); - if (buffer_write_io_error(bh)) { - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - ret = -EIO; - } - if (ret) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: Error %d detected " - "when updating journal superblock for %s.\n", - ret, journal_dev_name(journal, b)); - } -} - -/** - * journal_update_sb_log_tail() - Update log tail in journal sb on disk. - * @journal: The journal to update. - * @tail_tid: TID of the new transaction at the tail of the log - * @tail_block: The first block of the transaction at the tail of the log - * @write_op: With which operation should we write the journal sb - * - * Update a journal's superblock information about log tail and write it to - * disk, waiting for the IO to complete. - */ -void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, - unsigned int tail_block, int write_op) -{ - journal_superblock_t *sb = journal->j_superblock; - - BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", - tail_block, tail_tid); - - sb->s_sequence = cpu_to_be32(tail_tid); - sb->s_start = cpu_to_be32(tail_block); - - journal_write_superblock(journal, write_op); - - /* Log is no longer empty */ - spin_lock(&journal->j_state_lock); - WARN_ON(!sb->s_sequence); - journal->j_flags &= ~JFS_FLUSHED; - spin_unlock(&journal->j_state_lock); -} - -/** - * mark_journal_empty() - Mark on disk journal as empty. - * @journal: The journal to update. - * - * Update a journal's dynamic superblock fields to show that journal is empty. - * Write updated superblock to disk waiting for IO to complete. - */ -static void mark_journal_empty(journal_t *journal) -{ - journal_superblock_t *sb = journal->j_superblock; - - BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - spin_lock(&journal->j_state_lock); - /* Is it already empty? */ - if (sb->s_start == 0) { - spin_unlock(&journal->j_state_lock); - return; - } - jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", - journal->j_tail_sequence); - - sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(0); - spin_unlock(&journal->j_state_lock); - - journal_write_superblock(journal, WRITE_FUA); - - spin_lock(&journal->j_state_lock); - /* Log is empty */ - journal->j_flags |= JFS_FLUSHED; - spin_unlock(&journal->j_state_lock); -} - -/** - * journal_update_sb_errno() - Update error in the journal. - * @journal: The journal to update. - * - * Update a journal's errno. Write updated superblock to disk waiting for IO - * to complete. - */ -static void journal_update_sb_errno(journal_t *journal) -{ - journal_superblock_t *sb = journal->j_superblock; - - spin_lock(&journal->j_state_lock); - jbd_debug(1, "JBD: updating superblock error (errno %d)\n", - journal->j_errno); - sb->s_errno = cpu_to_be32(journal->j_errno); - spin_unlock(&journal->j_state_lock); - - journal_write_superblock(journal, WRITE_SYNC); -} - -/* - * Read the superblock for a given journal, performing initial - * validation of the format. - */ - -static int journal_get_superblock(journal_t *journal) -{ - struct buffer_head *bh; - journal_superblock_t *sb; - int err = -EIO; - - bh = journal->j_sb_buffer; - - J_ASSERT(bh != NULL); - if (!buffer_uptodate(bh)) { - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - printk (KERN_ERR - "JBD: IO error reading journal superblock\n"); - goto out; - } - } - - sb = journal->j_superblock; - - err = -EINVAL; - - if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) || - sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD: no valid journal superblock found\n"); - goto out; - } - - switch(be32_to_cpu(sb->s_header.h_blocktype)) { - case JFS_SUPERBLOCK_V1: - journal->j_format_version = 1; - break; - case JFS_SUPERBLOCK_V2: - journal->j_format_version = 2; - break; - default: - printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); - goto out; - } - - if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen) - journal->j_maxlen = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) { - printk (KERN_WARNING "JBD: journal file too short\n"); - goto out; - } - - if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_maxlen) { - printk(KERN_WARNING - "JBD: Invalid start block of journal: %u\n", - be32_to_cpu(sb->s_first)); - goto out; - } - - return 0; - -out: - journal_fail_superblock(journal); - return err; -} - -/* - * Load the on-disk journal superblock and read the key fields into the - * journal_t. - */ - -static int load_superblock(journal_t *journal) -{ - int err; - journal_superblock_t *sb; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); - journal->j_tail = be32_to_cpu(sb->s_start); - journal->j_first = be32_to_cpu(sb->s_first); - journal->j_last = be32_to_cpu(sb->s_maxlen); - journal->j_errno = be32_to_cpu(sb->s_errno); - - return 0; -} - - -/** - * int journal_load() - Read journal from disk. - * @journal: Journal to act on. - * - * Given a journal_t structure which tells us which disk blocks contain - * a journal, read the journal from disk to initialise the in-memory - * structures. - */ -int journal_load(journal_t *journal) -{ - int err; - journal_superblock_t *sb; - - err = load_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - /* If this is a V2 superblock, then we have to check the - * features flags on it. */ - - if (journal->j_format_version >= 2) { - if ((sb->s_feature_ro_compat & - ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || - (sb->s_feature_incompat & - ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { - printk (KERN_WARNING - "JBD: Unrecognised features on journal\n"); - return -EINVAL; - } - } - - /* Let the recovery code check whether it needs to recover any - * data from the journal. */ - if (journal_recover(journal)) - goto recovery_error; - - /* OK, we've finished with the dynamic journal bits: - * reinitialise the dynamic contents of the superblock in memory - * and reset them on disk. */ - if (journal_reset(journal)) - goto recovery_error; - - journal->j_flags &= ~JFS_ABORT; - journal->j_flags |= JFS_LOADED; - return 0; - -recovery_error: - printk (KERN_WARNING "JBD: recovery failed\n"); - return -EIO; -} - -/** - * void journal_destroy() - Release a journal_t structure. - * @journal: Journal to act on. - * - * Release a journal_t structure once it is no longer in use by the - * journaled object. - * Return <0 if we couldn't clean up the journal. - */ -int journal_destroy(journal_t *journal) -{ - int err = 0; - - - /* Wait for the commit thread to wake up and die. */ - journal_kill_thread(journal); - - /* Force a final log commit */ - if (journal->j_running_transaction) - journal_commit_transaction(journal); - - /* Force any old transactions to disk */ - - /* We cannot race with anybody but must keep assertions happy */ - mutex_lock(&journal->j_checkpoint_mutex); - /* Totally anal locking here... */ - spin_lock(&journal->j_list_lock); - while (journal->j_checkpoint_transactions != NULL) { - spin_unlock(&journal->j_list_lock); - log_do_checkpoint(journal); - spin_lock(&journal->j_list_lock); - } - - J_ASSERT(journal->j_running_transaction == NULL); - J_ASSERT(journal->j_committing_transaction == NULL); - J_ASSERT(journal->j_checkpoint_transactions == NULL); - spin_unlock(&journal->j_list_lock); - - if (journal->j_sb_buffer) { - if (!is_journal_aborted(journal)) { - journal->j_tail_sequence = - ++journal->j_transaction_sequence; - mark_journal_empty(journal); - } else - err = -EIO; - brelse(journal->j_sb_buffer); - } - mutex_unlock(&journal->j_checkpoint_mutex); - - iput(journal->j_inode); - if (journal->j_revoke) - journal_destroy_revoke(journal); - kfree(journal->j_wbuf); - kfree(journal); - - return err; -} - - -/** - *int journal_check_used_features () - Check if features specified are used. - * @journal: Journal to check. - * @compat: bitmask of compatible features - * @ro: bitmask of features that force read-only mount - * @incompat: bitmask of incompatible features - * - * Check whether the journal uses all of a given set of - * features. Return true (non-zero) if it does. - **/ - -int journal_check_used_features (journal_t *journal, unsigned long compat, - unsigned long ro, unsigned long incompat) -{ - journal_superblock_t *sb; - - if (!compat && !ro && !incompat) - return 1; - if (journal->j_format_version == 1) - return 0; - - sb = journal->j_superblock; - - if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && - ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && - ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) - return 1; - - return 0; -} - -/** - * int journal_check_available_features() - Check feature set in journalling layer - * @journal: Journal to check. - * @compat: bitmask of compatible features - * @ro: bitmask of features that force read-only mount - * @incompat: bitmask of incompatible features - * - * Check whether the journaling code supports the use of - * all of a given set of features on this journal. Return true - * (non-zero) if it can. */ - -int journal_check_available_features (journal_t *journal, unsigned long compat, - unsigned long ro, unsigned long incompat) -{ - if (!compat && !ro && !incompat) - return 1; - - /* We can support any known requested features iff the - * superblock is in version 2. Otherwise we fail to support any - * extended sb features. */ - - if (journal->j_format_version != 2) - return 0; - - if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && - (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && - (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) - return 1; - - return 0; -} - -/** - * int journal_set_features () - Mark a given journal feature in the superblock - * @journal: Journal to act on. - * @compat: bitmask of compatible features - * @ro: bitmask of features that force read-only mount - * @incompat: bitmask of incompatible features - * - * Mark a given journal feature as present on the - * superblock. Returns true if the requested features could be set. - * - */ - -int journal_set_features (journal_t *journal, unsigned long compat, - unsigned long ro, unsigned long incompat) -{ - journal_superblock_t *sb; - - if (journal_check_used_features(journal, compat, ro, incompat)) - return 1; - - if (!journal_check_available_features(journal, compat, ro, incompat)) - return 0; - - jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", - compat, ro, incompat); - - sb = journal->j_superblock; - - sb->s_feature_compat |= cpu_to_be32(compat); - sb->s_feature_ro_compat |= cpu_to_be32(ro); - sb->s_feature_incompat |= cpu_to_be32(incompat); - - return 1; -} - - -/** - * int journal_update_format () - Update on-disk journal structure. - * @journal: Journal to act on. - * - * Given an initialised but unloaded journal struct, poke about in the - * on-disk structure to update it to the most recent supported version. - */ -int journal_update_format (journal_t *journal) -{ - journal_superblock_t *sb; - int err; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - switch (be32_to_cpu(sb->s_header.h_blocktype)) { - case JFS_SUPERBLOCK_V2: - return 0; - case JFS_SUPERBLOCK_V1: - return journal_convert_superblock_v1(journal, sb); - default: - break; - } - return -EINVAL; -} - -static int journal_convert_superblock_v1(journal_t *journal, - journal_superblock_t *sb) -{ - int offset, blocksize; - struct buffer_head *bh; - - printk(KERN_WARNING - "JBD: Converting superblock from version 1 to 2.\n"); - - /* Pre-initialise new fields to zero */ - offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); - blocksize = be32_to_cpu(sb->s_blocksize); - memset(&sb->s_feature_compat, 0, blocksize-offset); - - sb->s_nr_users = cpu_to_be32(1); - sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); - journal->j_format_version = 2; - - bh = journal->j_sb_buffer; - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - sync_dirty_buffer(bh); - return 0; -} - - -/** - * int journal_flush () - Flush journal - * @journal: Journal to act on. - * - * Flush all data for a given journal to disk and empty the journal. - * Filesystems can use this when remounting readonly to ensure that - * recovery does not need to happen on remount. - */ - -int journal_flush(journal_t *journal) -{ - int err = 0; - transaction_t *transaction = NULL; - - spin_lock(&journal->j_state_lock); - - /* Force everything buffered to the log... */ - if (journal->j_running_transaction) { - transaction = journal->j_running_transaction; - __log_start_commit(journal, transaction->t_tid); - } else if (journal->j_committing_transaction) - transaction = journal->j_committing_transaction; - - /* Wait for the log commit to complete... */ - if (transaction) { - tid_t tid = transaction->t_tid; - - spin_unlock(&journal->j_state_lock); - log_wait_commit(journal, tid); - } else { - spin_unlock(&journal->j_state_lock); - } - - /* ...and flush everything in the log out to disk. */ - spin_lock(&journal->j_list_lock); - while (!err && journal->j_checkpoint_transactions != NULL) { - spin_unlock(&journal->j_list_lock); - mutex_lock(&journal->j_checkpoint_mutex); - err = log_do_checkpoint(journal); - mutex_unlock(&journal->j_checkpoint_mutex); - spin_lock(&journal->j_list_lock); - } - spin_unlock(&journal->j_list_lock); - - if (is_journal_aborted(journal)) - return -EIO; - - mutex_lock(&journal->j_checkpoint_mutex); - cleanup_journal_tail(journal); - - /* Finally, mark the journal as really needing no recovery. - * This sets s_start==0 in the underlying superblock, which is - * the magic code for a fully-recovered superblock. Any future - * commits of data to the journal will restore the current - * s_start value. */ - mark_journal_empty(journal); - mutex_unlock(&journal->j_checkpoint_mutex); - spin_lock(&journal->j_state_lock); - J_ASSERT(!journal->j_running_transaction); - J_ASSERT(!journal->j_committing_transaction); - J_ASSERT(!journal->j_checkpoint_transactions); - J_ASSERT(journal->j_head == journal->j_tail); - J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); - spin_unlock(&journal->j_state_lock); - return 0; -} - -/** - * int journal_wipe() - Wipe journal contents - * @journal: Journal to act on. - * @write: flag (see below) - * - * Wipe out all of the contents of a journal, safely. This will produce - * a warning if the journal contains any valid recovery information. - * Must be called between journal_init_*() and journal_load(). - * - * If 'write' is non-zero, then we wipe out the journal on disk; otherwise - * we merely suppress recovery. - */ - -int journal_wipe(journal_t *journal, int write) -{ - int err = 0; - - J_ASSERT (!(journal->j_flags & JFS_LOADED)); - - err = load_superblock(journal); - if (err) - return err; - - if (!journal->j_tail) - goto no_recovery; - - printk (KERN_WARNING "JBD: %s recovery information on journal\n", - write ? "Clearing" : "Ignoring"); - - err = journal_skip_recovery(journal); - if (write) { - /* Lock to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); - mark_journal_empty(journal); - mutex_unlock(&journal->j_checkpoint_mutex); - } - - no_recovery: - return err; -} - -/* - * journal_dev_name: format a character string to describe on what - * device this journal is present. - */ - -static const char *journal_dev_name(journal_t *journal, char *buffer) -{ - struct block_device *bdev; - - if (journal->j_inode) - bdev = journal->j_inode->i_sb->s_bdev; - else - bdev = journal->j_dev; - - return bdevname(bdev, buffer); -} - -/* - * Journal abort has very specific semantics, which we describe - * for journal abort. - * - * Two internal function, which provide abort to te jbd layer - * itself are here. - */ - -/* - * Quick version for internal journal use (doesn't lock the journal). - * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, - * and don't attempt to make any other journal updates. - */ -static void __journal_abort_hard(journal_t *journal) -{ - transaction_t *transaction; - char b[BDEVNAME_SIZE]; - - if (journal->j_flags & JFS_ABORT) - return; - - printk(KERN_ERR "Aborting journal on device %s.\n", - journal_dev_name(journal, b)); - - spin_lock(&journal->j_state_lock); - journal->j_flags |= JFS_ABORT; - transaction = journal->j_running_transaction; - if (transaction) - __log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); -} - -/* Soft abort: record the abort error status in the journal superblock, - * but don't do any other IO. */ -static void __journal_abort_soft (journal_t *journal, int errno) -{ - if (journal->j_flags & JFS_ABORT) - return; - - if (!journal->j_errno) - journal->j_errno = errno; - - __journal_abort_hard(journal); - - if (errno) - journal_update_sb_errno(journal); -} - -/** - * void journal_abort () - Shutdown the journal immediately. - * @journal: the journal to shutdown. - * @errno: an error number to record in the journal indicating - * the reason for the shutdown. - * - * Perform a complete, immediate shutdown of the ENTIRE - * journal (not of a single transaction). This operation cannot be - * undone without closing and reopening the journal. - * - * The journal_abort function is intended to support higher level error - * recovery mechanisms such as the ext2/ext3 remount-readonly error - * mode. - * - * Journal abort has very specific semantics. Any existing dirty, - * unjournaled buffers in the main filesystem will still be written to - * disk by bdflush, but the journaling mechanism will be suspended - * immediately and no further transaction commits will be honoured. - * - * Any dirty, journaled buffers will be written back to disk without - * hitting the journal. Atomicity cannot be guaranteed on an aborted - * filesystem, but we _do_ attempt to leave as much data as possible - * behind for fsck to use for cleanup. - * - * Any attempt to get a new transaction handle on a journal which is in - * ABORT state will just result in an -EROFS error return. A - * journal_stop on an existing handle will return -EIO if we have - * entered abort state during the update. - * - * Recursive transactions are not disturbed by journal abort until the - * final journal_stop, which will receive the -EIO error. - * - * Finally, the journal_abort call allows the caller to supply an errno - * which will be recorded (if possible) in the journal superblock. This - * allows a client to record failure conditions in the middle of a - * transaction without having to complete the transaction to record the - * failure to disk. ext3_error, for example, now uses this - * functionality. - * - * Errors which originate from within the journaling layer will NOT - * supply an errno; a null errno implies that absolutely no further - * writes are done to the journal (unless there are any already in - * progress). - * - */ - -void journal_abort(journal_t *journal, int errno) -{ - __journal_abort_soft(journal, errno); -} - -/** - * int journal_errno () - returns the journal's error state. - * @journal: journal to examine. - * - * This is the errno numbet set with journal_abort(), the last - * time the journal was mounted - if the journal was stopped - * without calling abort this will be 0. - * - * If the journal has been aborted on this mount time -EROFS will - * be returned. - */ -int journal_errno(journal_t *journal) -{ - int err; - - spin_lock(&journal->j_state_lock); - if (journal->j_flags & JFS_ABORT) - err = -EROFS; - else - err = journal->j_errno; - spin_unlock(&journal->j_state_lock); - return err; -} - -/** - * int journal_clear_err () - clears the journal's error state - * @journal: journal to act on. - * - * An error must be cleared or Acked to take a FS out of readonly - * mode. - */ -int journal_clear_err(journal_t *journal) -{ - int err = 0; - - spin_lock(&journal->j_state_lock); - if (journal->j_flags & JFS_ABORT) - err = -EROFS; - else - journal->j_errno = 0; - spin_unlock(&journal->j_state_lock); - return err; -} - -/** - * void journal_ack_err() - Ack journal err. - * @journal: journal to act on. - * - * An error must be cleared or Acked to take a FS out of readonly - * mode. - */ -void journal_ack_err(journal_t *journal) -{ - spin_lock(&journal->j_state_lock); - if (journal->j_errno) - journal->j_flags |= JFS_ACK_ERR; - spin_unlock(&journal->j_state_lock); -} - -int journal_blocks_per_page(struct inode *inode) -{ - return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); -} - -/* - * Journal_head storage management - */ -static struct kmem_cache *journal_head_cache; -#ifdef CONFIG_JBD_DEBUG -static atomic_t nr_journal_heads = ATOMIC_INIT(0); -#endif - -static int journal_init_journal_head_cache(void) -{ - int retval; - - J_ASSERT(journal_head_cache == NULL); - journal_head_cache = kmem_cache_create("journal_head", - sizeof(struct journal_head), - 0, /* offset */ - SLAB_TEMPORARY, /* flags */ - NULL); /* ctor */ - retval = 0; - if (!journal_head_cache) { - retval = -ENOMEM; - printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); - } - return retval; -} - -static void journal_destroy_journal_head_cache(void) -{ - if (journal_head_cache) { - kmem_cache_destroy(journal_head_cache); - journal_head_cache = NULL; - } -} - -/* - * journal_head splicing and dicing - */ -static struct journal_head *journal_alloc_journal_head(void) -{ - struct journal_head *ret; - -#ifdef CONFIG_JBD_DEBUG - atomic_inc(&nr_journal_heads); -#endif - ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); - if (ret == NULL) { - jbd_debug(1, "out of memory for journal_head\n"); - printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __func__); - - while (ret == NULL) { - yield(); - ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); - } - } - return ret; -} - -static void journal_free_journal_head(struct journal_head *jh) -{ -#ifdef CONFIG_JBD_DEBUG - atomic_dec(&nr_journal_heads); - memset(jh, JBD_POISON_FREE, sizeof(*jh)); -#endif - kmem_cache_free(journal_head_cache, jh); -} - -/* - * A journal_head is attached to a buffer_head whenever JBD has an - * interest in the buffer. - * - * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit - * is set. This bit is tested in core kernel code where we need to take - * JBD-specific actions. Testing the zeroness of ->b_private is not reliable - * there. - * - * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. - * - * When a buffer has its BH_JBD bit set it is immune from being released by - * core kernel code, mainly via ->b_count. - * - * A journal_head is detached from its buffer_head when the journal_head's - * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint - * transaction (b_cp_transaction) hold their references to b_jcount. - * - * Various places in the kernel want to attach a journal_head to a buffer_head - * _before_ attaching the journal_head to a transaction. To protect the - * journal_head in this situation, journal_add_journal_head elevates the - * journal_head's b_jcount refcount by one. The caller must call - * journal_put_journal_head() to undo this. - * - * So the typical usage would be: - * - * (Attach a journal_head if needed. Increments b_jcount) - * struct journal_head *jh = journal_add_journal_head(bh); - * ... - * (Get another reference for transaction) - * journal_grab_journal_head(bh); - * jh->b_transaction = xxx; - * (Put original reference) - * journal_put_journal_head(jh); - */ - -/* - * Give a buffer_head a journal_head. - * - * May sleep. - */ -struct journal_head *journal_add_journal_head(struct buffer_head *bh) -{ - struct journal_head *jh; - struct journal_head *new_jh = NULL; - -repeat: - if (!buffer_jbd(bh)) - new_jh = journal_alloc_journal_head(); - - jbd_lock_bh_journal_head(bh); - if (buffer_jbd(bh)) { - jh = bh2jh(bh); - } else { - J_ASSERT_BH(bh, - (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); - - if (!new_jh) { - jbd_unlock_bh_journal_head(bh); - goto repeat; - } - - jh = new_jh; - new_jh = NULL; /* We consumed it */ - set_buffer_jbd(bh); - bh->b_private = jh; - jh->b_bh = bh; - get_bh(bh); - BUFFER_TRACE(bh, "added journal_head"); - } - jh->b_jcount++; - jbd_unlock_bh_journal_head(bh); - if (new_jh) - journal_free_journal_head(new_jh); - return bh->b_private; -} - -/* - * Grab a ref against this buffer_head's journal_head. If it ended up not - * having a journal_head, return NULL - */ -struct journal_head *journal_grab_journal_head(struct buffer_head *bh) -{ - struct journal_head *jh = NULL; - - jbd_lock_bh_journal_head(bh); - if (buffer_jbd(bh)) { - jh = bh2jh(bh); - jh->b_jcount++; - } - jbd_unlock_bh_journal_head(bh); - return jh; -} - -static void __journal_remove_journal_head(struct buffer_head *bh) -{ - struct journal_head *jh = bh2jh(bh); - - J_ASSERT_JH(jh, jh->b_jcount >= 0); - J_ASSERT_JH(jh, jh->b_transaction == NULL); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); - jbd_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); - jbd_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - journal_free_journal_head(jh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then - * release the journal_head from the buffer_head. - */ -void journal_put_journal_head(struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - jbd_lock_bh_journal_head(bh); - J_ASSERT_JH(jh, jh->b_jcount > 0); - --jh->b_jcount; - if (!jh->b_jcount) { - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); - __brelse(bh); - } else - jbd_unlock_bh_journal_head(bh); -} - -/* - * debugfs tunables - */ -#ifdef CONFIG_JBD_DEBUG - -u8 journal_enable_debug __read_mostly; -EXPORT_SYMBOL(journal_enable_debug); - -static struct dentry *jbd_debugfs_dir; -static struct dentry *jbd_debug; - -static void __init jbd_create_debugfs_entry(void) -{ - jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); - if (jbd_debugfs_dir) - jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, - jbd_debugfs_dir, - &journal_enable_debug); -} - -static void __exit jbd_remove_debugfs_entry(void) -{ - debugfs_remove(jbd_debug); - debugfs_remove(jbd_debugfs_dir); -} - -#else - -static inline void jbd_create_debugfs_entry(void) -{ -} - -static inline void jbd_remove_debugfs_entry(void) -{ -} - -#endif - -struct kmem_cache *jbd_handle_cache; - -static int __init journal_init_handle_cache(void) -{ - jbd_handle_cache = kmem_cache_create("journal_handle", - sizeof(handle_t), - 0, /* offset */ - SLAB_TEMPORARY, /* flags */ - NULL); /* ctor */ - if (jbd_handle_cache == NULL) { - printk(KERN_EMERG "JBD: failed to create handle cache\n"); - return -ENOMEM; - } - return 0; -} - -static void journal_destroy_handle_cache(void) -{ - if (jbd_handle_cache) - kmem_cache_destroy(jbd_handle_cache); -} - -/* - * Module startup and shutdown - */ - -static int __init journal_init_caches(void) -{ - int ret; - - ret = journal_init_revoke_caches(); - if (ret == 0) - ret = journal_init_journal_head_cache(); - if (ret == 0) - ret = journal_init_handle_cache(); - return ret; -} - -static void journal_destroy_caches(void) -{ - journal_destroy_revoke_caches(); - journal_destroy_journal_head_cache(); - journal_destroy_handle_cache(); -} - -static int __init journal_init(void) -{ - int ret; - - BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); - - ret = journal_init_caches(); - if (ret != 0) - journal_destroy_caches(); - jbd_create_debugfs_entry(); - return ret; -} - -static void __exit journal_exit(void) -{ -#ifdef CONFIG_JBD_DEBUG - int n = atomic_read(&nr_journal_heads); - if (n) - printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); -#endif - jbd_remove_debugfs_entry(); - journal_destroy_caches(); -} - -MODULE_LICENSE("GPL"); -module_init(journal_init); -module_exit(journal_exit); - diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c deleted file mode 100644 index a748fe21465a..000000000000 --- a/fs/jbd/recovery.c +++ /dev/null @@ -1,594 +0,0 @@ -/* - * linux/fs/jbd/recovery.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 - * - * Copyright 1999-2000 Red Hat Software --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Journal recovery routines for the generic filesystem journaling code; - * part of the ext2fs journaling system. - */ - -#ifndef __KERNEL__ -#include "jfs_user.h" -#else -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/blkdev.h> -#endif - -/* - * Maintain information about the progress of the recovery job, so that - * the different passes can carry information between them. - */ -struct recovery_info -{ - tid_t start_transaction; - tid_t end_transaction; - - int nr_replays; - int nr_revokes; - int nr_revoke_hits; -}; - -enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; -static int do_one_pass(journal_t *journal, - struct recovery_info *info, enum passtype pass); -static int scan_revoke_records(journal_t *, struct buffer_head *, - tid_t, struct recovery_info *); - -#ifdef __KERNEL__ - -/* Release readahead buffers after use */ -static void journal_brelse_array(struct buffer_head *b[], int n) -{ - while (--n >= 0) - brelse (b[n]); -} - - -/* - * When reading from the journal, we are going through the block device - * layer directly and so there is no readahead being done for us. We - * need to implement any readahead ourselves if we want it to happen at - * all. Recovery is basically one long sequential read, so make sure we - * do the IO in reasonably large chunks. - * - * This is not so critical that we need to be enormously clever about - * the readahead size, though. 128K is a purely arbitrary, good-enough - * fixed value. - */ - -#define MAXBUF 8 -static int do_readahead(journal_t *journal, unsigned int start) -{ - int err; - unsigned int max, nbufs, next; - unsigned int blocknr; - struct buffer_head *bh; - - struct buffer_head * bufs[MAXBUF]; - - /* Do up to 128K of readahead */ - max = start + (128 * 1024 / journal->j_blocksize); - if (max > journal->j_maxlen) - max = journal->j_maxlen; - - /* Do the readahead itself. We'll submit MAXBUF buffer_heads at - * a time to the block device IO layer. */ - - nbufs = 0; - - for (next = start; next < max; next++) { - err = journal_bmap(journal, next, &blocknr); - - if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", - next); - goto failed; - } - - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - err = -ENOMEM; - goto failed; - } - - if (!buffer_uptodate(bh) && !buffer_locked(bh)) { - bufs[nbufs++] = bh; - if (nbufs == MAXBUF) { - ll_rw_block(READ, nbufs, bufs); - journal_brelse_array(bufs, nbufs); - nbufs = 0; - } - } else - brelse(bh); - } - - if (nbufs) - ll_rw_block(READ, nbufs, bufs); - err = 0; - -failed: - if (nbufs) - journal_brelse_array(bufs, nbufs); - return err; -} - -#endif /* __KERNEL__ */ - - -/* - * Read a block from the journal - */ - -static int jread(struct buffer_head **bhp, journal_t *journal, - unsigned int offset) -{ - int err; - unsigned int blocknr; - struct buffer_head *bh; - - *bhp = NULL; - - if (offset >= journal->j_maxlen) { - printk(KERN_ERR "JBD: corrupted journal superblock\n"); - return -EIO; - } - - err = journal_bmap(journal, offset, &blocknr); - - if (err) { - printk (KERN_ERR "JBD: bad block at offset %u\n", - offset); - return err; - } - - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) - return -ENOMEM; - - if (!buffer_uptodate(bh)) { - /* If this is a brand new buffer, start readahead. - Otherwise, we assume we are already reading it. */ - if (!buffer_req(bh)) - do_readahead(journal, offset); - wait_on_buffer(bh); - } - - if (!buffer_uptodate(bh)) { - printk (KERN_ERR "JBD: Failed to read block at offset %u\n", - offset); - brelse(bh); - return -EIO; - } - - *bhp = bh; - return 0; -} - - -/* - * Count the number of in-use tags in a journal descriptor block. - */ - -static int count_tags(struct buffer_head *bh, int size) -{ - char * tagp; - journal_block_tag_t * tag; - int nr = 0; - - tagp = &bh->b_data[sizeof(journal_header_t)]; - - while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { - tag = (journal_block_tag_t *) tagp; - - nr++; - tagp += sizeof(journal_block_tag_t); - if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) - tagp += 16; - - if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) - break; - } - - return nr; -} - - -/* Make sure we wrap around the log correctly! */ -#define wrap(journal, var) \ -do { \ - if (var >= (journal)->j_last) \ - var -= ((journal)->j_last - (journal)->j_first); \ -} while (0) - -/** - * journal_recover - recovers a on-disk journal - * @journal: the journal to recover - * - * The primary function for recovering the log contents when mounting a - * journaled device. - * - * Recovery is done in three passes. In the first pass, we look for the - * end of the log. In the second, we assemble the list of revoke - * blocks. In the third and final pass, we replay any un-revoked blocks - * in the log. - */ -int journal_recover(journal_t *journal) -{ - int err, err2; - journal_superblock_t * sb; - - struct recovery_info info; - - memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; - - /* - * The journal superblock's s_start field (the current log head) - * is always zero if, and only if, the journal was cleanly - * unmounted. - */ - - if (!sb->s_start) { - jbd_debug(1, "No recovery required, last transaction %d\n", - be32_to_cpu(sb->s_sequence)); - journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; - return 0; - } - - err = do_one_pass(journal, &info, PASS_SCAN); - if (!err) - err = do_one_pass(journal, &info, PASS_REVOKE); - if (!err) - err = do_one_pass(journal, &info, PASS_REPLAY); - - jbd_debug(1, "JBD: recovery, exit status %d, " - "recovered transactions %u to %u\n", - err, info.start_transaction, info.end_transaction); - jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", - info.nr_replays, info.nr_revoke_hits, info.nr_revokes); - - /* Restart the log at the next transaction ID, thus invalidating - * any existing commit records in the log. */ - journal->j_transaction_sequence = ++info.end_transaction; - - journal_clear_revoke(journal); - err2 = sync_blockdev(journal->j_fs_dev); - if (!err) - err = err2; - /* Flush disk caches to get replayed data on the permanent storage */ - if (journal->j_flags & JFS_BARRIER) { - err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); - if (!err) - err = err2; - } - - return err; -} - -/** - * journal_skip_recovery - Start journal and wipe exiting records - * @journal: journal to startup - * - * Locate any valid recovery information from the journal and set up the - * journal structures in memory to ignore it (presumably because the - * caller has evidence that it is out of date). - * This function does'nt appear to be exorted.. - * - * We perform one pass over the journal to allow us to tell the user how - * much recovery information is being erased, and to let us initialise - * the journal transaction sequence numbers to the next unused ID. - */ -int journal_skip_recovery(journal_t *journal) -{ - int err; - struct recovery_info info; - - memset (&info, 0, sizeof(info)); - - err = do_one_pass(journal, &info, PASS_SCAN); - - if (err) { - printk(KERN_ERR "JBD: error %d scanning journal\n", err); - ++journal->j_transaction_sequence; - } else { -#ifdef CONFIG_JBD_DEBUG - int dropped = info.end_transaction - - be32_to_cpu(journal->j_superblock->s_sequence); - jbd_debug(1, - "JBD: ignoring %d transaction%s from the journal.\n", - dropped, (dropped == 1) ? "" : "s"); -#endif - journal->j_transaction_sequence = ++info.end_transaction; - } - - journal->j_tail = 0; - return err; -} - -static int do_one_pass(journal_t *journal, - struct recovery_info *info, enum passtype pass) -{ - unsigned int first_commit_ID, next_commit_ID; - unsigned int next_log_block; - int err, success = 0; - journal_superblock_t * sb; - journal_header_t * tmp; - struct buffer_head * bh; - unsigned int sequence; - int blocktype; - - /* - * First thing is to establish what we expect to find in the log - * (in terms of transaction IDs), and where (in terms of log - * block offsets): query the superblock. - */ - - sb = journal->j_superblock; - next_commit_ID = be32_to_cpu(sb->s_sequence); - next_log_block = be32_to_cpu(sb->s_start); - - first_commit_ID = next_commit_ID; - if (pass == PASS_SCAN) - info->start_transaction = first_commit_ID; - - jbd_debug(1, "Starting recovery pass %d\n", pass); - - /* - * Now we walk through the log, transaction by transaction, - * making sure that each transaction has a commit block in the - * expected place. Each complete transaction gets replayed back - * into the main filesystem. - */ - - while (1) { - int flags; - char * tagp; - journal_block_tag_t * tag; - struct buffer_head * obh; - struct buffer_head * nbh; - - cond_resched(); - - /* If we already know where to stop the log traversal, - * check right now that we haven't gone past the end of - * the log. */ - - if (pass != PASS_SCAN) - if (tid_geq(next_commit_ID, info->end_transaction)) - break; - - jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", - next_commit_ID, next_log_block, journal->j_last); - - /* Skip over each chunk of the transaction looking - * either the next descriptor block or the final commit - * record. */ - - jbd_debug(3, "JBD: checking block %u\n", next_log_block); - err = jread(&bh, journal, next_log_block); - if (err) - goto failed; - - next_log_block++; - wrap(journal, next_log_block); - - /* What kind of buffer is it? - * - * If it is a descriptor block, check that it has the - * expected sequence number. Otherwise, we're all done - * here. */ - - tmp = (journal_header_t *)bh->b_data; - - if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { - brelse(bh); - break; - } - - blocktype = be32_to_cpu(tmp->h_blocktype); - sequence = be32_to_cpu(tmp->h_sequence); - jbd_debug(3, "Found magic %d, sequence %d\n", - blocktype, sequence); - - if (sequence != next_commit_ID) { - brelse(bh); - break; - } - - /* OK, we have a valid descriptor block which matches - * all of the sequence number checks. What are we going - * to do with it? That depends on the pass... */ - - switch(blocktype) { - case JFS_DESCRIPTOR_BLOCK: - /* If it is a valid descriptor block, replay it - * in pass REPLAY; otherwise, just skip over the - * blocks it describes. */ - if (pass != PASS_REPLAY) { - next_log_block += - count_tags(bh, journal->j_blocksize); - wrap(journal, next_log_block); - brelse(bh); - continue; - } - - /* A descriptor block: we can now write all of - * the data blocks. Yay, useful work is finally - * getting done here! */ - - tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) - <= journal->j_blocksize) { - unsigned int io_block; - - tag = (journal_block_tag_t *) tagp; - flags = be32_to_cpu(tag->t_flags); - - io_block = next_log_block++; - wrap(journal, next_log_block); - err = jread(&obh, journal, io_block); - if (err) { - /* Recover what we can, but - * report failure at the end. */ - success = err; - printk (KERN_ERR - "JBD: IO error %d recovering " - "block %u in log\n", - err, io_block); - } else { - unsigned int blocknr; - - J_ASSERT(obh != NULL); - blocknr = be32_to_cpu(tag->t_blocknr); - - /* If the block has been - * revoked, then we're all done - * here. */ - if (journal_test_revoke - (journal, blocknr, - next_commit_ID)) { - brelse(obh); - ++info->nr_revoke_hits; - goto skip_write; - } - - /* Find a buffer for the new - * data being restored */ - nbh = __getblk(journal->j_fs_dev, - blocknr, - journal->j_blocksize); - if (nbh == NULL) { - printk(KERN_ERR - "JBD: Out of memory " - "during recovery.\n"); - err = -ENOMEM; - brelse(bh); - brelse(obh); - goto failed; - } - - lock_buffer(nbh); - memcpy(nbh->b_data, obh->b_data, - journal->j_blocksize); - if (flags & JFS_FLAG_ESCAPE) { - *((__be32 *)nbh->b_data) = - cpu_to_be32(JFS_MAGIC_NUMBER); - } - - BUFFER_TRACE(nbh, "marking dirty"); - set_buffer_uptodate(nbh); - mark_buffer_dirty(nbh); - BUFFER_TRACE(nbh, "marking uptodate"); - ++info->nr_replays; - /* ll_rw_block(WRITE, 1, &nbh); */ - unlock_buffer(nbh); - brelse(obh); - brelse(nbh); - } - - skip_write: - tagp += sizeof(journal_block_tag_t); - if (!(flags & JFS_FLAG_SAME_UUID)) - tagp += 16; - - if (flags & JFS_FLAG_LAST_TAG) - break; - } - - brelse(bh); - continue; - - case JFS_COMMIT_BLOCK: - /* Found an expected commit block: not much to - * do other than move on to the next sequence - * number. */ - brelse(bh); - next_commit_ID++; - continue; - - case JFS_REVOKE_BLOCK: - /* If we aren't in the REVOKE pass, then we can - * just skip over this block. */ - if (pass != PASS_REVOKE) { - brelse(bh); - continue; - } - - err = scan_revoke_records(journal, bh, - next_commit_ID, info); - brelse(bh); - if (err) - goto failed; - continue; - - default: - jbd_debug(3, "Unrecognised magic %d, end of scan.\n", - blocktype); - brelse(bh); - goto done; - } - } - - done: - /* - * We broke out of the log scan loop: either we came to the - * known end of the log or we found an unexpected block in the - * log. If the latter happened, then we know that the "current" - * transaction marks the end of the valid log. - */ - - if (pass == PASS_SCAN) - info->end_transaction = next_commit_ID; - else { - /* It's really bad news if different passes end up at - * different places (but possible due to IO errors). */ - if (info->end_transaction != next_commit_ID) { - printk (KERN_ERR "JBD: recovery pass %d ended at " - "transaction %u, expected %u\n", - pass, next_commit_ID, info->end_transaction); - if (!success) - success = -EIO; - } - } - - return success; - - failed: - return err; -} - - -/* Scan a revoke record, marking all blocks mentioned as revoked. */ - -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, - tid_t sequence, struct recovery_info *info) -{ - journal_revoke_header_t *header; - int offset, max; - - header = (journal_revoke_header_t *) bh->b_data; - offset = sizeof(journal_revoke_header_t); - max = be32_to_cpu(header->r_count); - - while (offset < max) { - unsigned int blocknr; - int err; - - blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); - offset += 4; - err = journal_set_revoke(journal, blocknr, sequence); - if (err) - return err; - ++info->nr_revokes; - } - return 0; -} diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c deleted file mode 100644 index dcead636c33b..000000000000 --- a/fs/jbd/revoke.c +++ /dev/null @@ -1,733 +0,0 @@ -/* - * linux/fs/jbd/revoke.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 - * - * Copyright 2000 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Journal revoke routines for the generic filesystem journaling code; - * part of the ext2fs journaling system. - * - * Revoke is the mechanism used to prevent old log records for deleted - * metadata from being replayed on top of newer data using the same - * blocks. The revoke mechanism is used in two separate places: - * - * + Commit: during commit we write the entire list of the current - * transaction's revoked blocks to the journal - * - * + Recovery: during recovery we record the transaction ID of all - * revoked blocks. If there are multiple revoke records in the log - * for a single block, only the last one counts, and if there is a log - * entry for a block beyond the last revoke, then that log entry still - * gets replayed. - * - * We can get interactions between revokes and new log data within a - * single transaction: - * - * Block is revoked and then journaled: - * The desired end result is the journaling of the new block, so we - * cancel the revoke before the transaction commits. - * - * Block is journaled and then revoked: - * The revoke must take precedence over the write of the block, so we - * need either to cancel the journal entry or to write the revoke - * later in the log than the log block. In this case, we choose the - * latter: journaling a block cancels any revoke record for that block - * in the current transaction, so any revoke for that block in the - * transaction must have happened after the block was journaled and so - * the revoke must take precedence. - * - * Block is revoked and then written as data: - * The data write is allowed to succeed, but the revoke is _not_ - * cancelled. We still need to prevent old log records from - * overwriting the new data. We don't even need to clear the revoke - * bit here. - * - * We cache revoke status of a buffer in the current transaction in b_states - * bits. As the name says, revokevalid flag indicates that the cached revoke - * status of a buffer is valid and we can rely on the cached status. - * - * Revoke information on buffers is a tri-state value: - * - * RevokeValid clear: no cached revoke status, need to look it up - * RevokeValid set, Revoked clear: - * buffer has not been revoked, and cancel_revoke - * need do nothing. - * RevokeValid set, Revoked set: - * buffer has been revoked. - * - * Locking rules: - * We keep two hash tables of revoke records. One hashtable belongs to the - * running transaction (is pointed to by journal->j_revoke), the other one - * belongs to the committing transaction. Accesses to the second hash table - * happen only from the kjournald and no other thread touches this table. Also - * journal_switch_revoke_table() which switches which hashtable belongs to the - * running and which to the committing transaction is called only from - * kjournald. Therefore we need no locks when accessing the hashtable belonging - * to the committing transaction. - * - * All users operating on the hash table belonging to the running transaction - * have a handle to the transaction. Therefore they are safe from kjournald - * switching hash tables under them. For operations on the lists of entries in - * the hash table j_revoke_lock is used. - * - * Finally, also replay code uses the hash tables but at this moment no one else - * can touch them (filesystem isn't mounted yet) and hence no locking is - * needed. - */ - -#ifndef __KERNEL__ -#include "jfs_user.h" -#else -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/init.h> -#include <linux/bio.h> -#endif -#include <linux/log2.h> -#include <linux/hash.h> - -static struct kmem_cache *revoke_record_cache; -static struct kmem_cache *revoke_table_cache; - -/* Each revoke record represents one single revoked block. During - journal replay, this involves recording the transaction ID of the - last transaction to revoke this block. */ - -struct jbd_revoke_record_s -{ - struct list_head hash; - tid_t sequence; /* Used for recovery only */ - unsigned int blocknr; -}; - - -/* The revoke table is just a simple hash table of revoke records. */ -struct jbd_revoke_table_s -{ - /* It is conceivable that we might want a larger hash table - * for recovery. Must be a power of two. */ - int hash_size; - int hash_shift; - struct list_head *hash_table; -}; - - -#ifdef __KERNEL__ -static void write_one_revoke_record(journal_t *, transaction_t *, - struct journal_head **, int *, - struct jbd_revoke_record_s *, int); -static void flush_descriptor(journal_t *, struct journal_head *, int, int); -#endif - -/* Utility functions to maintain the revoke table */ - -static inline int hash(journal_t *journal, unsigned int block) -{ - struct jbd_revoke_table_s *table = journal->j_revoke; - - return hash_32(block, table->hash_shift); -} - -static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, - tid_t seq) -{ - struct list_head *hash_list; - struct jbd_revoke_record_s *record; - -repeat: - record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); - if (!record) - goto oom; - - record->sequence = seq; - record->blocknr = blocknr; - hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; - spin_lock(&journal->j_revoke_lock); - list_add(&record->hash, hash_list); - spin_unlock(&journal->j_revoke_lock); - return 0; - -oom: - if (!journal_oom_retry) - return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); - yield(); - goto repeat; -} - -/* Find a revoke record in the journal's hash table. */ - -static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned int blocknr) -{ - struct list_head *hash_list; - struct jbd_revoke_record_s *record; - - hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; - - spin_lock(&journal->j_revoke_lock); - record = (struct jbd_revoke_record_s *) hash_list->next; - while (&(record->hash) != hash_list) { - if (record->blocknr == blocknr) { - spin_unlock(&journal->j_revoke_lock); - return record; - } - record = (struct jbd_revoke_record_s *) record->hash.next; - } - spin_unlock(&journal->j_revoke_lock); - return NULL; -} - -void journal_destroy_revoke_caches(void) -{ - if (revoke_record_cache) { - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - } - if (revoke_table_cache) { - kmem_cache_destroy(revoke_table_cache); - revoke_table_cache = NULL; - } -} - -int __init journal_init_revoke_caches(void) -{ - J_ASSERT(!revoke_record_cache); - J_ASSERT(!revoke_table_cache); - - revoke_record_cache = kmem_cache_create("revoke_record", - sizeof(struct jbd_revoke_record_s), - 0, - SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, - NULL); - if (!revoke_record_cache) - goto record_cache_failure; - - revoke_table_cache = kmem_cache_create("revoke_table", - sizeof(struct jbd_revoke_table_s), - 0, SLAB_TEMPORARY, NULL); - if (!revoke_table_cache) - goto table_cache_failure; - - return 0; - -table_cache_failure: - journal_destroy_revoke_caches(); -record_cache_failure: - return -ENOMEM; -} - -static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) -{ - int i; - struct jbd_revoke_table_s *table; - - table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!table) - goto out; - - table->hash_size = hash_size; - table->hash_shift = ilog2(hash_size); - table->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!table->hash_table) { - kmem_cache_free(revoke_table_cache, table); - table = NULL; - goto out; - } - - for (i = 0; i < hash_size; i++) - INIT_LIST_HEAD(&table->hash_table[i]); - -out: - return table; -} - -static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) -{ - int i; - struct list_head *hash_list; - - for (i = 0; i < table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT(list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); -} - -/* Initialise the revoke table for a given journal to a given size. */ -int journal_init_revoke(journal_t *journal, int hash_size) -{ - J_ASSERT(journal->j_revoke_table[0] == NULL); - J_ASSERT(is_power_of_2(hash_size)); - - journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); - if (!journal->j_revoke_table[0]) - goto fail0; - - journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); - if (!journal->j_revoke_table[1]) - goto fail1; - - journal->j_revoke = journal->j_revoke_table[1]; - - spin_lock_init(&journal->j_revoke_lock); - - return 0; - -fail1: - journal_destroy_revoke_table(journal->j_revoke_table[0]); -fail0: - return -ENOMEM; -} - -/* Destroy a journal's revoke table. The table must already be empty! */ -void journal_destroy_revoke(journal_t *journal) -{ - journal->j_revoke = NULL; - if (journal->j_revoke_table[0]) - journal_destroy_revoke_table(journal->j_revoke_table[0]); - if (journal->j_revoke_table[1]) - journal_destroy_revoke_table(journal->j_revoke_table[1]); -} - - -#ifdef __KERNEL__ - -/* - * journal_revoke: revoke a given buffer_head from the journal. This - * prevents the block from being replayed during recovery if we take a - * crash after this current transaction commits. Any subsequent - * metadata writes of the buffer in this transaction cancel the - * revoke. - * - * Note that this call may block --- it is up to the caller to make - * sure that there are no further calls to journal_write_metadata - * before the revoke is complete. In ext3, this implies calling the - * revoke before clearing the block bitmap when we are deleting - * metadata. - * - * Revoke performs a journal_forget on any buffer_head passed in as a - * parameter, but does _not_ forget the buffer_head if the bh was only - * found implicitly. - * - * bh_in may not be a journalled buffer - it may have come off - * the hash tables without an attached journal_head. - * - * If bh_in is non-zero, journal_revoke() will decrement its b_count - * by one. - */ - -int journal_revoke(handle_t *handle, unsigned int blocknr, - struct buffer_head *bh_in) -{ - struct buffer_head *bh = NULL; - journal_t *journal; - struct block_device *bdev; - int err; - - might_sleep(); - if (bh_in) - BUFFER_TRACE(bh_in, "enter"); - - journal = handle->h_transaction->t_journal; - if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){ - J_ASSERT (!"Cannot set revoke feature!"); - return -EINVAL; - } - - bdev = journal->j_fs_dev; - bh = bh_in; - - if (!bh) { - bh = __find_get_block(bdev, blocknr, journal->j_blocksize); - if (bh) - BUFFER_TRACE(bh, "found on hash"); - } -#ifdef JBD_EXPENSIVE_CHECKING - else { - struct buffer_head *bh2; - - /* If there is a different buffer_head lying around in - * memory anywhere... */ - bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); - if (bh2) { - /* ... and it has RevokeValid status... */ - if (bh2 != bh && buffer_revokevalid(bh2)) - /* ...then it better be revoked too, - * since it's illegal to create a revoke - * record against a buffer_head which is - * not marked revoked --- that would - * risk missing a subsequent revoke - * cancel. */ - J_ASSERT_BH(bh2, buffer_revoked(bh2)); - put_bh(bh2); - } - } -#endif - - /* We really ought not ever to revoke twice in a row without - first having the revoke cancelled: it's illegal to free a - block twice without allocating it in between! */ - if (bh) { - if (!J_EXPECT_BH(bh, !buffer_revoked(bh), - "inconsistent data on disk")) { - if (!bh_in) - brelse(bh); - return -EIO; - } - set_buffer_revoked(bh); - set_buffer_revokevalid(bh); - if (bh_in) { - BUFFER_TRACE(bh_in, "call journal_forget"); - journal_forget(handle, bh_in); - } else { - BUFFER_TRACE(bh, "call brelse"); - __brelse(bh); - } - } - - jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); - err = insert_revoke_hash(journal, blocknr, - handle->h_transaction->t_tid); - BUFFER_TRACE(bh_in, "exit"); - return err; -} - -/* - * Cancel an outstanding revoke. For use only internally by the - * journaling code (called from journal_get_write_access). - * - * We trust buffer_revoked() on the buffer if the buffer is already - * being journaled: if there is no revoke pending on the buffer, then we - * don't do anything here. - * - * This would break if it were possible for a buffer to be revoked and - * discarded, and then reallocated within the same transaction. In such - * a case we would have lost the revoked bit, but when we arrived here - * the second time we would still have a pending revoke to cancel. So, - * do not trust the Revoked bit on buffers unless RevokeValid is also - * set. - */ -int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) -{ - struct jbd_revoke_record_s *record; - journal_t *journal = handle->h_transaction->t_journal; - int need_cancel; - int did_revoke = 0; /* akpm: debug */ - struct buffer_head *bh = jh2bh(jh); - - jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); - - /* Is the existing Revoke bit valid? If so, we trust it, and - * only perform the full cancel if the revoke bit is set. If - * not, we can't trust the revoke bit, and we need to do the - * full search for a revoke record. */ - if (test_set_buffer_revokevalid(bh)) { - need_cancel = test_clear_buffer_revoked(bh); - } else { - need_cancel = 1; - clear_buffer_revoked(bh); - } - - if (need_cancel) { - record = find_revoke_record(journal, bh->b_blocknr); - if (record) { - jbd_debug(4, "cancelled existing revoke on " - "blocknr %llu\n", (unsigned long long)bh->b_blocknr); - spin_lock(&journal->j_revoke_lock); - list_del(&record->hash); - spin_unlock(&journal->j_revoke_lock); - kmem_cache_free(revoke_record_cache, record); - did_revoke = 1; - } - } - -#ifdef JBD_EXPENSIVE_CHECKING - /* There better not be one left behind by now! */ - record = find_revoke_record(journal, bh->b_blocknr); - J_ASSERT_JH(jh, record == NULL); -#endif - - /* Finally, have we just cleared revoke on an unhashed - * buffer_head? If so, we'd better make sure we clear the - * revoked status on any hashed alias too, otherwise the revoke - * state machine will get very upset later on. */ - if (need_cancel) { - struct buffer_head *bh2; - bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); - if (bh2) { - if (bh2 != bh) - clear_buffer_revoked(bh2); - __brelse(bh2); - } - } - return did_revoke; -} - -/* - * journal_clear_revoked_flags clears revoked flag of buffers in - * revoke table to reflect there is no revoked buffer in the next - * transaction which is going to be started. - */ -void journal_clear_buffer_revoked_flags(journal_t *journal) -{ - struct jbd_revoke_table_s *revoke = journal->j_revoke; - int i = 0; - - for (i = 0; i < revoke->hash_size; i++) { - struct list_head *hash_list; - struct list_head *list_entry; - hash_list = &revoke->hash_table[i]; - - list_for_each(list_entry, hash_list) { - struct jbd_revoke_record_s *record; - struct buffer_head *bh; - record = (struct jbd_revoke_record_s *)list_entry; - bh = __find_get_block(journal->j_fs_dev, - record->blocknr, - journal->j_blocksize); - if (bh) { - clear_buffer_revoked(bh); - __brelse(bh); - } - } - } -} - -/* journal_switch_revoke table select j_revoke for next transaction - * we do not want to suspend any processing until all revokes are - * written -bzzz - */ -void journal_switch_revoke_table(journal_t *journal) -{ - int i; - - if (journal->j_revoke == journal->j_revoke_table[0]) - journal->j_revoke = journal->j_revoke_table[1]; - else - journal->j_revoke = journal->j_revoke_table[0]; - - for (i = 0; i < journal->j_revoke->hash_size; i++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); -} - -/* - * Write revoke records to the journal for all entries in the current - * revoke hash, deleting the entries as we go. - */ -void journal_write_revoke_records(journal_t *journal, - transaction_t *transaction, int write_op) -{ - struct journal_head *descriptor; - struct jbd_revoke_record_s *record; - struct jbd_revoke_table_s *revoke; - struct list_head *hash_list; - int i, offset, count; - - descriptor = NULL; - offset = 0; - count = 0; - - /* select revoke table for committing transaction */ - revoke = journal->j_revoke == journal->j_revoke_table[0] ? - journal->j_revoke_table[1] : journal->j_revoke_table[0]; - - for (i = 0; i < revoke->hash_size; i++) { - hash_list = &revoke->hash_table[i]; - - while (!list_empty(hash_list)) { - record = (struct jbd_revoke_record_s *) - hash_list->next; - write_one_revoke_record(journal, transaction, - &descriptor, &offset, - record, write_op); - count++; - list_del(&record->hash); - kmem_cache_free(revoke_record_cache, record); - } - } - if (descriptor) - flush_descriptor(journal, descriptor, offset, write_op); - jbd_debug(1, "Wrote %d revoke records\n", count); -} - -/* - * Write out one revoke record. We need to create a new descriptor - * block if the old one is full or if we have not already created one. - */ - -static void write_one_revoke_record(journal_t *journal, - transaction_t *transaction, - struct journal_head **descriptorp, - int *offsetp, - struct jbd_revoke_record_s *record, - int write_op) -{ - struct journal_head *descriptor; - int offset; - journal_header_t *header; - - /* If we are already aborting, this all becomes a noop. We - still need to go round the loop in - journal_write_revoke_records in order to free all of the - revoke records: only the IO to the journal is omitted. */ - if (is_journal_aborted(journal)) - return; - - descriptor = *descriptorp; - offset = *offsetp; - - /* Make sure we have a descriptor with space left for the record */ - if (descriptor) { - if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset, write_op); - descriptor = NULL; - } - } - - if (!descriptor) { - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) - return; - header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; - header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK); - header->h_sequence = cpu_to_be32(transaction->t_tid); - - /* Record it so that we can wait for IO completion later */ - JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); - journal_file_buffer(descriptor, transaction, BJ_LogCtl); - - offset = sizeof(journal_revoke_header_t); - *descriptorp = descriptor; - } - - * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = - cpu_to_be32(record->blocknr); - offset += 4; - *offsetp = offset; -} - -/* - * Flush a revoke descriptor out to the journal. If we are aborting, - * this is a noop; otherwise we are generating a buffer which needs to - * be waited for during commit, so it has to go onto the appropriate - * journal buffer list. - */ - -static void flush_descriptor(journal_t *journal, - struct journal_head *descriptor, - int offset, int write_op) -{ - journal_revoke_header_t *header; - struct buffer_head *bh = jh2bh(descriptor); - - if (is_journal_aborted(journal)) { - put_bh(bh); - return; - } - - header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; - header->r_count = cpu_to_be32(offset); - set_buffer_jwrite(bh); - BUFFER_TRACE(bh, "write"); - set_buffer_dirty(bh); - write_dirty_buffer(bh, write_op); -} -#endif - -/* - * Revoke support for recovery. - * - * Recovery needs to be able to: - * - * record all revoke records, including the tid of the latest instance - * of each revoke in the journal - * - * check whether a given block in a given transaction should be replayed - * (ie. has not been revoked by a revoke record in that or a subsequent - * transaction) - * - * empty the revoke table after recovery. - */ - -/* - * First, setting revoke records. We create a new revoke record for - * every block ever revoked in the log as we scan it for recovery, and - * we update the existing records if we find multiple revokes for a - * single block. - */ - -int journal_set_revoke(journal_t *journal, - unsigned int blocknr, - tid_t sequence) -{ - struct jbd_revoke_record_s *record; - - record = find_revoke_record(journal, blocknr); - if (record) { - /* If we have multiple occurrences, only record the - * latest sequence number in the hashed record */ - if (tid_gt(sequence, record->sequence)) - record->sequence = sequence; - return 0; - } - return insert_revoke_hash(journal, blocknr, sequence); -} - -/* - * Test revoke records. For a given block referenced in the log, has - * that block been revoked? A revoke record with a given transaction - * sequence number revokes all blocks in that transaction and earlier - * ones, but later transactions still need replayed. - */ - -int journal_test_revoke(journal_t *journal, - unsigned int blocknr, - tid_t sequence) -{ - struct jbd_revoke_record_s *record; - - record = find_revoke_record(journal, blocknr); - if (!record) - return 0; - if (tid_gt(sequence, record->sequence)) - return 0; - return 1; -} - -/* - * Finally, once recovery is over, we need to clear the revoke table so - * that it can be reused by the running filesystem. - */ - -void journal_clear_revoke(journal_t *journal) -{ - int i; - struct list_head *hash_list; - struct jbd_revoke_record_s *record; - struct jbd_revoke_table_s *revoke; - - revoke = journal->j_revoke; - - for (i = 0; i < revoke->hash_size; i++) { - hash_list = &revoke->hash_table[i]; - while (!list_empty(hash_list)) { - record = (struct jbd_revoke_record_s*) hash_list->next; - list_del(&record->hash); - kmem_cache_free(revoke_record_cache, record); - } - } -} diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c deleted file mode 100644 index 1695ba8334a2..000000000000 --- a/fs/jbd/transaction.c +++ /dev/null @@ -1,2237 +0,0 @@ -/* - * linux/fs/jbd/transaction.c - * - * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 - * - * Copyright 1998 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Generic filesystem transaction handling code; part of the ext2fs - * journaling system. - * - * This file manages transactions (compound commits managed by the - * journaling code) and handles (individual atomic operations by the - * filesystem). - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/hrtimer.h> - -static void __journal_temp_unlink_buffer(struct journal_head *jh); - -/* - * get_transaction: obtain a new transaction_t object. - * - * Simply allocate and initialise a new transaction. Create it in - * RUNNING state and add it to the current journal (which should not - * have an existing running transaction: we only make a new transaction - * once we have started to commit the old one). - * - * Preconditions: - * The journal MUST be locked. We don't perform atomic mallocs on the - * new transaction and we can't block without protecting against other - * processes trying to touch the journal while it is in transition. - * - * Called under j_state_lock - */ - -static transaction_t * -get_transaction(journal_t *journal, transaction_t *transaction) -{ - transaction->t_journal = journal; - transaction->t_state = T_RUNNING; - transaction->t_start_time = ktime_get(); - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + journal->j_commit_interval; - spin_lock_init(&transaction->t_handle_lock); - - /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = - round_jiffies_up(transaction->t_expires); - add_timer(&journal->j_commit_timer); - - J_ASSERT(journal->j_running_transaction == NULL); - journal->j_running_transaction = transaction; - - return transaction; -} - -/* - * Handle management. - * - * A handle_t is an object which represents a single atomic update to a - * filesystem, and which tracks all of the modifications which form part - * of that one update. - */ - -/* - * start_this_handle: Given a handle, deal with any locking or stalling - * needed to make sure that there is enough journal space for the handle - * to begin. Attach the handle to a transaction and set up the - * transaction's buffer credits. - */ - -static int start_this_handle(journal_t *journal, handle_t *handle) -{ - transaction_t *transaction; - int needed; - int nblocks = handle->h_buffer_credits; - transaction_t *new_transaction = NULL; - int ret = 0; - - if (nblocks > journal->j_max_transaction_buffers) { - printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", - current->comm, nblocks, - journal->j_max_transaction_buffers); - ret = -ENOSPC; - goto out; - } - -alloc_transaction: - if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); - if (!new_transaction) { - ret = -ENOMEM; - goto out; - } - } - - jbd_debug(3, "New handle %p going live.\n", handle); - -repeat: - - /* - * We need to hold j_state_lock until t_updates has been incremented, - * for proper journal barrier handling - */ - spin_lock(&journal->j_state_lock); -repeat_locked: - if (is_journal_aborted(journal) || - (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { - spin_unlock(&journal->j_state_lock); - ret = -EROFS; - goto out; - } - - /* Wait on the journal's transaction barrier if necessary */ - if (journal->j_barrier_count) { - spin_unlock(&journal->j_state_lock); - wait_event(journal->j_wait_transaction_locked, - journal->j_barrier_count == 0); - goto repeat; - } - - if (!journal->j_running_transaction) { - if (!new_transaction) { - spin_unlock(&journal->j_state_lock); - goto alloc_transaction; - } - get_transaction(journal, new_transaction); - new_transaction = NULL; - } - - transaction = journal->j_running_transaction; - - /* - * If the current transaction is locked down for commit, wait for the - * lock to be released. - */ - if (transaction->t_state == T_LOCKED) { - DEFINE_WAIT(wait); - - prepare_to_wait(&journal->j_wait_transaction_locked, - &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_transaction_locked, &wait); - goto repeat; - } - - /* - * If there is not enough space left in the log to write all potential - * buffers requested by this operation, we need to stall pending a log - * checkpoint to free some more log space. - */ - spin_lock(&transaction->t_handle_lock); - needed = transaction->t_outstanding_credits + nblocks; - - if (needed > journal->j_max_transaction_buffers) { - /* - * If the current transaction is already too large, then start - * to commit it: we can then go back and attach this handle to - * a new transaction. - */ - DEFINE_WAIT(wait); - - jbd_debug(2, "Handle %p starting new commit...\n", handle); - spin_unlock(&transaction->t_handle_lock); - prepare_to_wait(&journal->j_wait_transaction_locked, &wait, - TASK_UNINTERRUPTIBLE); - __log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_transaction_locked, &wait); - goto repeat; - } - - /* - * The commit code assumes that it can get enough log space - * without forcing a checkpoint. This is *critical* for - * correctness: a checkpoint of a buffer which is also - * associated with a committing transaction creates a deadlock, - * so commit simply cannot force through checkpoints. - * - * We must therefore ensure the necessary space in the journal - * *before* starting to dirty potentially checkpointed buffers - * in the new transaction. - * - * The worst part is, any transaction currently committing can - * reduce the free space arbitrarily. Be careful to account for - * those buffers when checkpointing. - */ - - /* - * @@@ AKPM: This seems rather over-defensive. We're giving commit - * a _lot_ of headroom: 1/4 of the journal plus the size of - * the committing transaction. Really, we only need to give it - * committing_transaction->t_outstanding_credits plus "enough" for - * the log control blocks. - * Also, this test is inconsistent with the matching one in - * journal_extend(). - */ - if (__log_space_left(journal) < jbd_space_needed(journal)) { - jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - spin_unlock(&transaction->t_handle_lock); - __log_wait_for_space(journal); - goto repeat_locked; - } - - /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. */ - - handle->h_transaction = transaction; - transaction->t_outstanding_credits += nblocks; - transaction->t_updates++; - transaction->t_handle_count++; - jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, transaction->t_outstanding_credits, - __log_space_left(journal)); - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); - - lock_map_acquire(&handle->h_lockdep_map); -out: - if (unlikely(new_transaction)) /* It's usually NULL */ - kfree(new_transaction); - return ret; -} - -static struct lock_class_key jbd_handle_key; - -/* Allocate a new handle. This should probably be in a slab... */ -static handle_t *new_handle(int nblocks) -{ - handle_t *handle = jbd_alloc_handle(GFP_NOFS); - if (!handle) - return NULL; - handle->h_buffer_credits = nblocks; - handle->h_ref = 1; - - lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0); - - return handle; -} - -/** - * handle_t *journal_start() - Obtain a new handle. - * @journal: Journal to start transaction on. - * @nblocks: number of block buffer we might modify - * - * We make sure that the transaction can guarantee at least nblocks of - * modified buffers in the log. We block until the log can guarantee - * that much space. - * - * This function is visible to journal users (like ext3fs), so is not - * called with the journal already locked. - * - * Return a pointer to a newly allocated handle, or an ERR_PTR() value - * on failure. - */ -handle_t *journal_start(journal_t *journal, int nblocks) -{ - handle_t *handle = journal_current_handle(); - int err; - - if (!journal) - return ERR_PTR(-EROFS); - - if (handle) { - J_ASSERT(handle->h_transaction->t_journal == journal); - handle->h_ref++; - return handle; - } - - handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); - - current->journal_info = handle; - - err = start_this_handle(journal, handle); - if (err < 0) { - jbd_free_handle(handle); - current->journal_info = NULL; - handle = ERR_PTR(err); - } - return handle; -} - -/** - * int journal_extend() - extend buffer credits. - * @handle: handle to 'extend' - * @nblocks: nr blocks to try to extend by. - * - * Some transactions, such as large extends and truncates, can be done - * atomically all at once or in several stages. The operation requests - * a credit for a number of buffer modications in advance, but can - * extend its credit if it needs more. - * - * journal_extend tries to give the running handle more buffer credits. - * It does not guarantee that allocation - this is a best-effort only. - * The calling process MUST be able to deal cleanly with a failure to - * extend here. - * - * Return 0 on success, non-zero on failure. - * - * return code < 0 implies an error - * return code > 0 implies normal transaction-full status. - */ -int journal_extend(handle_t *handle, int nblocks) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - int result; - int wanted; - - result = -EIO; - if (is_handle_aborted(handle)) - goto out; - - result = 1; - - spin_lock(&journal->j_state_lock); - - /* Don't extend a locked-down transaction! */ - if (handle->h_transaction->t_state != T_RUNNING) { - jbd_debug(3, "denied handle %p %d blocks: " - "transaction not running\n", handle, nblocks); - goto error_out; - } - - spin_lock(&transaction->t_handle_lock); - wanted = transaction->t_outstanding_credits + nblocks; - - if (wanted > journal->j_max_transaction_buffers) { - jbd_debug(3, "denied handle %p %d blocks: " - "transaction too large\n", handle, nblocks); - goto unlock; - } - - if (wanted > __log_space_left(journal)) { - jbd_debug(3, "denied handle %p %d blocks: " - "insufficient log space\n", handle, nblocks); - goto unlock; - } - - handle->h_buffer_credits += nblocks; - transaction->t_outstanding_credits += nblocks; - result = 0; - - jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); -unlock: - spin_unlock(&transaction->t_handle_lock); -error_out: - spin_unlock(&journal->j_state_lock); -out: - return result; -} - - -/** - * int journal_restart() - restart a handle. - * @handle: handle to restart - * @nblocks: nr credits requested - * - * Restart a handle for a multi-transaction filesystem - * operation. - * - * If the journal_extend() call above fails to grant new buffer credits - * to a running handle, a call to journal_restart will commit the - * handle's transaction so far and reattach the handle to a new - * transaction capabable of guaranteeing the requested number of - * credits. - */ - -int journal_restart(handle_t *handle, int nblocks) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - int ret; - - /* If we've had an abort of any type, don't even think about - * actually doing the restart! */ - if (is_handle_aborted(handle)) - return 0; - - /* - * First unlink the handle from its current transaction, and start the - * commit on that. - */ - J_ASSERT(transaction->t_updates > 0); - J_ASSERT(journal_current_handle() == handle); - - spin_lock(&journal->j_state_lock); - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - - if (!transaction->t_updates) - wake_up(&journal->j_wait_updates); - spin_unlock(&transaction->t_handle_lock); - - jbd_debug(2, "restarting handle %p\n", handle); - __log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); - - lock_map_release(&handle->h_lockdep_map); - handle->h_buffer_credits = nblocks; - ret = start_this_handle(journal, handle); - return ret; -} - - -/** - * void journal_lock_updates () - establish a transaction barrier. - * @journal: Journal to establish a barrier on. - * - * This locks out any further updates from being started, and blocks until all - * existing updates have completed, returning only once the journal is in a - * quiescent state with no updates running. - * - * We do not use simple mutex for synchronization as there are syscalls which - * want to return with filesystem locked and that trips up lockdep. Also - * hibernate needs to lock filesystem but locked mutex then blocks hibernation. - * Since locking filesystem is rare operation, we use simple counter and - * waitqueue for locking. - */ -void journal_lock_updates(journal_t *journal) -{ - DEFINE_WAIT(wait); - -wait: - /* Wait for previous locked operation to finish */ - wait_event(journal->j_wait_transaction_locked, - journal->j_barrier_count == 0); - - spin_lock(&journal->j_state_lock); - /* - * Check reliably under the lock whether we are the ones winning the race - * and locking the journal - */ - if (journal->j_barrier_count > 0) { - spin_unlock(&journal->j_state_lock); - goto wait; - } - ++journal->j_barrier_count; - - /* Wait until there are no running updates */ - while (1) { - transaction_t *transaction = journal->j_running_transaction; - - if (!transaction) - break; - - spin_lock(&transaction->t_handle_lock); - if (!transaction->t_updates) { - spin_unlock(&transaction->t_handle_lock); - break; - } - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_updates, &wait); - spin_lock(&journal->j_state_lock); - } - spin_unlock(&journal->j_state_lock); -} - -/** - * void journal_unlock_updates (journal_t* journal) - release barrier - * @journal: Journal to release the barrier on. - * - * Release a transaction barrier obtained with journal_lock_updates(). - */ -void journal_unlock_updates (journal_t *journal) -{ - J_ASSERT(journal->j_barrier_count != 0); - - spin_lock(&journal->j_state_lock); - --journal->j_barrier_count; - spin_unlock(&journal->j_state_lock); - wake_up(&journal->j_wait_transaction_locked); -} - -static void warn_dirty_buffer(struct buffer_head *bh) -{ - char b[BDEVNAME_SIZE]; - - printk(KERN_WARNING - "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " - "There's a risk of filesystem corruption in case of system " - "crash.\n", - bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); -} - -/* - * If the buffer is already part of the current transaction, then there - * is nothing we need to do. If it is already part of a prior - * transaction which we are still committing to disk, then we need to - * make sure that we do not overwrite the old copy: we do copy-out to - * preserve the copy going to disk. We also account the buffer against - * the handle's metadata buffer credits (unless the buffer is already - * part of the transaction, that is). - * - */ -static int -do_get_write_access(handle_t *handle, struct journal_head *jh, - int force_copy) -{ - struct buffer_head *bh; - transaction_t *transaction; - journal_t *journal; - int error; - char *frozen_buffer = NULL; - int need_copy = 0; - - if (is_handle_aborted(handle)) - return -EROFS; - - transaction = handle->h_transaction; - journal = transaction->t_journal; - - jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); - - JBUFFER_TRACE(jh, "entry"); -repeat: - bh = jh2bh(jh); - - /* @@@ Need to check for errors here at some point. */ - - lock_buffer(bh); - jbd_lock_bh_state(bh); - - /* We now hold the buffer lock so it is safe to query the buffer - * state. Is the buffer dirty? - * - * If so, there are two possibilities. The buffer may be - * non-journaled, and undergoing a quite legitimate writeback. - * Otherwise, it is journaled, and we don't expect dirty buffers - * in that state (the buffers should be marked JBD_Dirty - * instead.) So either the IO is being done under our own - * control and this is a bug, or it's a third party IO such as - * dump(8) (which may leave the buffer scheduled for read --- - * ie. locked but not dirty) or tune2fs (which may actually have - * the buffer dirtied, ugh.) */ - - if (buffer_dirty(bh)) { - /* - * First question: is this buffer already part of the current - * transaction or the existing committing transaction? - */ - if (jh->b_transaction) { - J_ASSERT_JH(jh, - jh->b_transaction == transaction || - jh->b_transaction == - journal->j_committing_transaction); - if (jh->b_next_transaction) - J_ASSERT_JH(jh, jh->b_next_transaction == - transaction); - warn_dirty_buffer(bh); - } - /* - * In any case we need to clean the dirty flag and we must - * do it under the buffer lock to be sure we don't race - * with running write-out. - */ - JBUFFER_TRACE(jh, "Journalling dirty buffer"); - clear_buffer_dirty(bh); - set_buffer_jbddirty(bh); - } - - unlock_buffer(bh); - - error = -EROFS; - if (is_handle_aborted(handle)) { - jbd_unlock_bh_state(bh); - goto out; - } - error = 0; - - /* - * The buffer is already part of this transaction if b_transaction or - * b_next_transaction points to it - */ - if (jh->b_transaction == transaction || - jh->b_next_transaction == transaction) - goto done; - - /* - * this is the first time this transaction is touching this buffer, - * reset the modified flag - */ - jh->b_modified = 0; - - /* - * If there is already a copy-out version of this buffer, then we don't - * need to make another one - */ - if (jh->b_frozen_data) { - JBUFFER_TRACE(jh, "has frozen data"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - jh->b_next_transaction = transaction; - goto done; - } - - /* Is there data here we need to preserve? */ - - if (jh->b_transaction && jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "owned by older transaction"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* There is one case we have to be very careful about. - * If the committing transaction is currently writing - * this buffer out to disk and has NOT made a copy-out, - * then we cannot modify the buffer contents at all - * right now. The essence of copy-out is that it is the - * extra copy, not the primary copy, which gets - * journaled. If the primary copy is already going to - * disk then we cannot do copy-out here. */ - - if (jh->b_jlist == BJ_Shadow) { - DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); - wait_queue_head_t *wqh; - - wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); - - JBUFFER_TRACE(jh, "on shadow: sleep"); - jbd_unlock_bh_state(bh); - /* commit wakes up all shadow buffers after IO */ - for ( ; ; ) { - prepare_to_wait(wqh, &wait.wait, - TASK_UNINTERRUPTIBLE); - if (jh->b_jlist != BJ_Shadow) - break; - schedule(); - } - finish_wait(wqh, &wait.wait); - goto repeat; - } - - /* Only do the copy if the currently-owning transaction - * still needs it. If it is on the Forget list, the - * committing transaction is past that stage. The - * buffer had better remain locked during the kmalloc, - * but that should be true --- we hold the journal lock - * still and the buffer is already on the BUF_JOURNAL - * list so won't be flushed. - * - * Subtle point, though: if this is a get_undo_access, - * then we will be relying on the frozen_data to contain - * the new value of the committed_data record after the - * transaction, so we HAVE to force the frozen_data copy - * in that case. */ - - if (jh->b_jlist != BJ_Forget || force_copy) { - JBUFFER_TRACE(jh, "generate frozen data"); - if (!frozen_buffer) { - JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); - frozen_buffer = - jbd_alloc(jh2bh(jh)->b_size, - GFP_NOFS); - if (!frozen_buffer) { - printk(KERN_ERR - "%s: OOM for frozen_buffer\n", - __func__); - JBUFFER_TRACE(jh, "oom!"); - error = -ENOMEM; - jbd_lock_bh_state(bh); - goto done; - } - goto repeat; - } - jh->b_frozen_data = frozen_buffer; - frozen_buffer = NULL; - need_copy = 1; - } - jh->b_next_transaction = transaction; - } - - - /* - * Finally, if the buffer is not journaled right now, we need to make - * sure it doesn't get written to disk before the caller actually - * commits the new data - */ - if (!jh->b_transaction) { - JBUFFER_TRACE(jh, "no transaction"); - J_ASSERT_JH(jh, !jh->b_next_transaction); - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - spin_lock(&journal->j_list_lock); - __journal_file_buffer(jh, transaction, BJ_Reserved); - spin_unlock(&journal->j_list_lock); - } - -done: - if (need_copy) { - struct page *page; - int offset; - char *source; - - J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), - "Possible IO failure.\n"); - page = jh2bh(jh)->b_page; - offset = offset_in_page(jh2bh(jh)->b_data); - source = kmap_atomic(page); - memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source); - } - jbd_unlock_bh_state(bh); - - /* - * If we are about to journal a buffer, then any revoke pending on it is - * no longer valid - */ - journal_cancel_revoke(handle, jh); - -out: - if (unlikely(frozen_buffer)) /* It's usually NULL */ - jbd_free(frozen_buffer, bh->b_size); - - JBUFFER_TRACE(jh, "exit"); - return error; -} - -/** - * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. - * @handle: transaction to add buffer modifications to - * @bh: bh to be used for metadata writes - * - * Returns an error code or 0 on success. - * - * In full data journalling mode the buffer may be of type BJ_AsyncData, - * because we're write()ing a buffer which is also part of a shared mapping. - */ - -int journal_get_write_access(handle_t *handle, struct buffer_head *bh) -{ - struct journal_head *jh = journal_add_journal_head(bh); - int rc; - - /* We do not want to get caught playing with fields which the - * log thread also manipulates. Make sure that the buffer - * completes any outstanding IO before proceeding. */ - rc = do_get_write_access(handle, jh, 0); - journal_put_journal_head(jh); - return rc; -} - - -/* - * When the user wants to journal a newly created buffer_head - * (ie. getblk() returned a new buffer and we are going to populate it - * manually rather than reading off disk), then we need to keep the - * buffer_head locked until it has been completely filled with new - * data. In this case, we should be able to make the assertion that - * the bh is not already part of an existing transaction. - * - * The buffer should already be locked by the caller by this point. - * There is no lock ranking violation: it was a newly created, - * unlocked buffer beforehand. */ - -/** - * int journal_get_create_access () - notify intent to use newly created bh - * @handle: transaction to new buffer to - * @bh: new buffer. - * - * Call this if you create a new bh. - */ -int journal_get_create_access(handle_t *handle, struct buffer_head *bh) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - struct journal_head *jh = journal_add_journal_head(bh); - int err; - - jbd_debug(5, "journal_head %p\n", jh); - err = -EROFS; - if (is_handle_aborted(handle)) - goto out; - err = 0; - - JBUFFER_TRACE(jh, "entry"); - /* - * The buffer may already belong to this transaction due to pre-zeroing - * in the filesystem's new_block code. It may also be on the previous, - * committing transaction's lists, but it HAS to be in Forget state in - * that case: the transaction must have deleted the buffer for it to be - * reused here. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - J_ASSERT_JH(jh, (jh->b_transaction == transaction || - jh->b_transaction == NULL || - (jh->b_transaction == journal->j_committing_transaction && - jh->b_jlist == BJ_Forget))); - - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); - - if (jh->b_transaction == NULL) { - /* - * Previous journal_forget() could have left the buffer - * with jbddirty bit set because it was being committed. When - * the commit finished, we've filed the buffer for - * checkpointing and marked it dirty. Now we are reallocating - * the buffer so the transaction freeing it must have - * committed and so it's safe to clear the dirty bit. - */ - clear_buffer_dirty(jh2bh(jh)); - - /* first access by this transaction */ - jh->b_modified = 0; - - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - __journal_file_buffer(jh, transaction, BJ_Reserved); - } else if (jh->b_transaction == journal->j_committing_transaction) { - /* first access by this transaction */ - jh->b_modified = 0; - - JBUFFER_TRACE(jh, "set next transaction"); - jh->b_next_transaction = transaction; - } - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - - /* - * akpm: I added this. ext3_alloc_branch can pick up new indirect - * blocks which contain freed but then revoked metadata. We need - * to cancel the revoke in case we end up freeing it yet again - * and the reallocating as data - this would cause a second revoke, - * which hits an assertion error. - */ - JBUFFER_TRACE(jh, "cancelling revoke"); - journal_cancel_revoke(handle, jh); -out: - journal_put_journal_head(jh); - return err; -} - -/** - * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences - * @handle: transaction - * @bh: buffer to undo - * - * Sometimes there is a need to distinguish between metadata which has - * been committed to disk and that which has not. The ext3fs code uses - * this for freeing and allocating space, we have to make sure that we - * do not reuse freed space until the deallocation has been committed, - * since if we overwrote that space we would make the delete - * un-rewindable in case of a crash. - * - * To deal with that, journal_get_undo_access requests write access to a - * buffer for parts of non-rewindable operations such as delete - * operations on the bitmaps. The journaling code must keep a copy of - * the buffer's contents prior to the undo_access call until such time - * as we know that the buffer has definitely been committed to disk. - * - * We never need to know which transaction the committed data is part - * of, buffers touched here are guaranteed to be dirtied later and so - * will be committed to a new transaction in due course, at which point - * we can discard the old committed data pointer. - * - * Returns error number or 0 on success. - */ -int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) -{ - int err; - struct journal_head *jh = journal_add_journal_head(bh); - char *committed_data = NULL; - - JBUFFER_TRACE(jh, "entry"); - - /* - * Do this first --- it can drop the journal lock, so we want to - * make sure that obtaining the committed_data is done - * atomically wrt. completion of any outstanding commits. - */ - err = do_get_write_access(handle, jh, 1); - if (err) - goto out; - -repeat: - if (!jh->b_committed_data) { - committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); - if (!committed_data) { - printk(KERN_ERR "%s: No memory for committed data\n", - __func__); - err = -ENOMEM; - goto out; - } - } - - jbd_lock_bh_state(bh); - if (!jh->b_committed_data) { - /* Copy out the current buffer contents into the - * preserved, committed copy. */ - JBUFFER_TRACE(jh, "generate b_committed data"); - if (!committed_data) { - jbd_unlock_bh_state(bh); - goto repeat; - } - - jh->b_committed_data = committed_data; - committed_data = NULL; - memcpy(jh->b_committed_data, bh->b_data, bh->b_size); - } - jbd_unlock_bh_state(bh); -out: - journal_put_journal_head(jh); - if (unlikely(committed_data)) - jbd_free(committed_data, bh->b_size); - return err; -} - -/** - * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed - * @handle: transaction - * @bh: bufferhead to mark - * - * Description: - * Mark a buffer as containing dirty data which needs to be flushed before - * we can commit the current transaction. - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - int ret = 0; - - if (is_handle_aborted(handle)) - return ret; - - jh = journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. - */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. - * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. - * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* - * We cannot remove the buffer with io error from the - * committing transaction, because otherwise it would - * miss the error and the commit would not abort. - */ - if (unlikely(!buffer_uptodate(bh))) { - ret = -EIO; - goto no_journal; - } - /* We might have slept so buffer could be refiled now */ - if (jh->b_transaction != NULL && - jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "unfile from commit"); - __journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. */ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. - */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - JBUFFER_TRACE(jh, "file as data"); - __journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - journal_put_journal_head(jh); - return ret; -} - -/** - * int journal_dirty_metadata() - mark a buffer as containing dirty metadata - * @handle: transaction to add buffer to. - * @bh: buffer to mark - * - * Mark dirty metadata which needs to be journaled as part of the current - * transaction. - * - * The buffer is placed on the transaction's metadata list and is marked - * as belonging to the transaction. - * - * Returns error number or 0 on success. - * - * Special care needs to be taken if the buffer already belongs to the - * current committing transaction (in which case we should have frozen - * data present for that commit). In that case, we don't relink the - * buffer: that only gets done when the old transaction finally - * completes its commit. - */ -int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); - - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); - if (is_handle_aborted(handle)) - goto out; - - jbd_lock_bh_state(bh); - - if (jh->b_modified == 0) { - /* - * This buffer's got modified and becoming part - * of the transaction. This needs to be done - * once a transaction -bzzz - */ - jh->b_modified = 1; - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); - handle->h_buffer_credits--; - } - - /* - * fastpath, to avoid expensive locking. If this buffer is already - * on the running transaction's metadata list there is nothing to do. - * Nobody can take it off again because there is a handle open. - * I _think_ we're OK here with SMP barriers - a mistaken decision will - * result in this test being false, so we go in and take the locks. - */ - if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { - JBUFFER_TRACE(jh, "fastpath"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_running_transaction); - goto out_unlock_bh; - } - - set_buffer_jbddirty(bh); - - /* - * Metadata already on the current transaction list doesn't - * need to be filed. Metadata on another transaction's list must - * be committing, and will be refiled once the commit completes: - * leave it alone for now. - */ - if (jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "already on other transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - J_ASSERT_JH(jh, jh->b_next_transaction == transaction); - /* And this case is illegal: we can't reuse another - * transaction's data buffer, ever. */ - goto out_unlock_bh; - } - - /* That test should have eliminated the following case: */ - J_ASSERT_JH(jh, jh->b_frozen_data == NULL); - - JBUFFER_TRACE(jh, "file as BJ_Metadata"); - spin_lock(&journal->j_list_lock); - __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); - spin_unlock(&journal->j_list_lock); -out_unlock_bh: - jbd_unlock_bh_state(bh); -out: - JBUFFER_TRACE(jh, "exit"); - return 0; -} - -/* - * journal_release_buffer: undo a get_write_access without any buffer - * updates, if the update decided in the end that it didn't need access. - * - */ -void -journal_release_buffer(handle_t *handle, struct buffer_head *bh) -{ - BUFFER_TRACE(bh, "entry"); -} - -/** - * void journal_forget() - bforget() for potentially-journaled buffers. - * @handle: transaction handle - * @bh: bh to 'forget' - * - * We can only do the bforget if there are no commits pending against the - * buffer. If the buffer is dirty in the current running transaction we - * can safely unlink it. - * - * bh may not be a journalled buffer at all - it may be a non-JBD - * buffer which came off the hashtable. Check for this. - * - * Decrements bh->b_count by one. - * - * Allow this call even if the handle has aborted --- it may be part of - * the caller's cleanup after an abort. - */ -int journal_forget (handle_t *handle, struct buffer_head *bh) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - struct journal_head *jh; - int drop_reserve = 0; - int err = 0; - int was_modified = 0; - - BUFFER_TRACE(bh, "entry"); - - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - if (!buffer_jbd(bh)) - goto not_jbd; - jh = bh2jh(bh); - - /* Critical error: attempting to delete a bitmap buffer, maybe? - * Don't do any jbd operations, and return an error. */ - if (!J_EXPECT_JH(jh, !jh->b_committed_data, - "inconsistent data on disk")) { - err = -EIO; - goto not_jbd; - } - - /* keep track of whether or not this transaction modified us */ - was_modified = jh->b_modified; - - /* - * The buffer's going from the transaction, we must drop - * all references -bzzz - */ - jh->b_modified = 0; - - if (jh->b_transaction == handle->h_transaction) { - J_ASSERT_JH(jh, !jh->b_frozen_data); - - /* If we are forgetting a buffer which is already part - * of this transaction, then we can just drop it from - * the transaction immediately. */ - clear_buffer_dirty(bh); - clear_buffer_jbddirty(bh); - - JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); - - /* - * we only want to drop a reference if this transaction - * modified the buffer - */ - if (was_modified) - drop_reserve = 1; - - /* - * We are no longer going to journal this buffer. - * However, the commit of this transaction is still - * important to the buffer: the delete that we are now - * processing might obsolete an old log entry, so by - * committing, we can satisfy the buffer's checkpoint. - * - * So, if we have a checkpoint on the buffer, we should - * now refile the buffer on our BJ_Forget list so that - * we know to remove the checkpoint after we commit. - */ - - if (jh->b_cp_transaction) { - __journal_temp_unlink_buffer(jh); - __journal_file_buffer(jh, transaction, BJ_Forget); - } else { - __journal_unfile_buffer(jh); - if (!buffer_jbd(bh)) { - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; - } - } - } else if (jh->b_transaction) { - J_ASSERT_JH(jh, (jh->b_transaction == - journal->j_committing_transaction)); - /* However, if the buffer is still owned by a prior - * (committing) transaction, we can't drop it yet... */ - JBUFFER_TRACE(jh, "belongs to older transaction"); - /* ... but we CAN drop it from the new transaction if we - * have also modified it since the original commit. */ - - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == transaction); - jh->b_next_transaction = NULL; - - /* - * only drop a reference if this transaction modified - * the buffer - */ - if (was_modified) - drop_reserve = 1; - } - } - -not_jbd: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __brelse(bh); -drop: - if (drop_reserve) { - /* no need to reserve log space for this block -bzzz */ - handle->h_buffer_credits++; - } - return err; -} - -/** - * int journal_stop() - complete a transaction - * @handle: tranaction to complete. - * - * All done for a particular handle. - * - * There is not much action needed here. We just return any remaining - * buffer credits to the transaction and remove the handle. The only - * complication is that we need to start a commit operation if the - * filesystem is marked for synchronous update. - * - * journal_stop itself will not usually return an error, but it may - * do so in unusual circumstances. In particular, expect it to - * return -EIO if a journal_abort has been executed since the - * transaction began. - */ -int journal_stop(handle_t *handle) -{ - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - int err; - pid_t pid; - - J_ASSERT(journal_current_handle() == handle); - - if (is_handle_aborted(handle)) - err = -EIO; - else { - J_ASSERT(transaction->t_updates > 0); - err = 0; - } - - if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, - handle->h_ref); - return err; - } - - jbd_debug(4, "Handle %p going down\n", handle); - - /* - * Implement synchronous transaction batching. If the handle - * was synchronous, don't force a commit immediately. Let's - * yield and let another thread piggyback onto this transaction. - * Keep doing that while new threads continue to arrive. - * It doesn't cost much - we're about to run a commit and sleep - * on IO anyway. Speeds up many-threaded, many-dir operations - * by 30x or more... - * - * We try and optimize the sleep time against what the underlying disk - * can do, instead of having a static sleep time. This is useful for - * the case where our storage is so fast that it is more optimal to go - * ahead and force a flush and wait for the transaction to be committed - * than it is to wait for an arbitrary amount of time for new writers to - * join the transaction. We achieve this by measuring how long it takes - * to commit a transaction, and compare it with how long this - * transaction has been running, and if run time < commit time then we - * sleep for the delta and commit. This greatly helps super fast disks - * that would see slowdowns as more threads started doing fsyncs. - * - * But don't do this if this process was the most recent one to - * perform a synchronous write. We do this to detect the case where a - * single process is doing a stream of sync writes. No point in waiting - * for joiners in that case. - */ - pid = current->pid; - if (handle->h_sync && journal->j_last_sync_writer != pid) { - u64 commit_time, trans_time; - - journal->j_last_sync_writer = pid; - - spin_lock(&journal->j_state_lock); - commit_time = journal->j_average_commit_time; - spin_unlock(&journal->j_state_lock); - - trans_time = ktime_to_ns(ktime_sub(ktime_get(), - transaction->t_start_time)); - - commit_time = min_t(u64, commit_time, - 1000*jiffies_to_usecs(1)); - - if (trans_time < commit_time) { - ktime_t expires = ktime_add_ns(ktime_get(), - commit_time); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); - } - } - - current->journal_info = NULL; - spin_lock(&journal->j_state_lock); - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - if (!transaction->t_updates) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } - - /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the - * transaction is too old now. - */ - if (handle->h_sync || - transaction->t_outstanding_credits > - journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { - /* Do this even for aborted journals: an abort still - * completes the commit thread, it just doesn't write - * anything to disk. */ - tid_t tid = transaction->t_tid; - - spin_unlock(&transaction->t_handle_lock); - jbd_debug(2, "transaction too old, requesting commit for " - "handle %p\n", handle); - /* This is non-blocking */ - __log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); - - /* - * Special case: JFS_SYNC synchronous updates require us - * to wait for the commit to complete. - */ - if (handle->h_sync && !(current->flags & PF_MEMALLOC)) - err = log_wait_commit(journal, tid); - } else { - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); - } - - lock_map_release(&handle->h_lockdep_map); - - jbd_free_handle(handle); - return err; -} - -/** - * int journal_force_commit() - force any uncommitted transactions - * @journal: journal to force - * - * For synchronous operations: force any uncommitted transactions - * to disk. May seem kludgy, but it reuses all the handle batching - * code in a very simple manner. - */ -int journal_force_commit(journal_t *journal) -{ - handle_t *handle; - int ret; - - handle = journal_start(journal, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - } else { - handle->h_sync = 1; - ret = journal_stop(handle); - } - return ret; -} - -/* - * - * List management code snippets: various functions for manipulating the - * transaction buffer lists. - * - */ - -/* - * Append a buffer to a transaction list, given the transaction's list head - * pointer. - * - * j_list_lock is held. - * - * jbd_lock_bh_state(jh2bh(jh)) is held. - */ - -static inline void -__blist_add_buffer(struct journal_head **list, struct journal_head *jh) -{ - if (!*list) { - jh->b_tnext = jh->b_tprev = jh; - *list = jh; - } else { - /* Insert at the tail of the list to preserve order */ - struct journal_head *first = *list, *last = first->b_tprev; - jh->b_tprev = last; - jh->b_tnext = first; - last->b_tnext = first->b_tprev = jh; - } -} - -/* - * Remove a buffer from a transaction list, given the transaction's list - * head pointer. - * - * Called with j_list_lock held, and the journal may not be locked. - * - * jbd_lock_bh_state(jh2bh(jh)) is held. - */ - -static inline void -__blist_del_buffer(struct journal_head **list, struct journal_head *jh) -{ - if (*list == jh) { - *list = jh->b_tnext; - if (*list == jh) - *list = NULL; - } - jh->b_tprev->b_tnext = jh->b_tnext; - jh->b_tnext->b_tprev = jh->b_tprev; -} - -/* - * Remove a buffer from the appropriate transaction list. - * - * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. - * - * Called under j_list_lock. The journal may not be locked. - */ -static void __journal_temp_unlink_buffer(struct journal_head *jh) -{ - struct journal_head **list = NULL; - transaction_t *transaction; - struct buffer_head *bh = jh2bh(jh); - - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); - transaction = jh->b_transaction; - if (transaction) - assert_spin_locked(&transaction->t_journal->j_list_lock); - - J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); - if (jh->b_jlist != BJ_None) - J_ASSERT_JH(jh, transaction != NULL); - - switch (jh->b_jlist) { - case BJ_None: - return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; - case BJ_Metadata: - transaction->t_nr_buffers--; - J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); - list = &transaction->t_buffers; - break; - case BJ_Forget: - list = &transaction->t_forget; - break; - case BJ_IO: - list = &transaction->t_iobuf_list; - break; - case BJ_Shadow: - list = &transaction->t_shadow_list; - break; - case BJ_LogCtl: - list = &transaction->t_log_list; - break; - case BJ_Reserved: - list = &transaction->t_reserved_list; - break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; - } - - __blist_del_buffer(list, jh); - jh->b_jlist = BJ_None; - if (test_clear_buffer_jbddirty(bh)) - mark_buffer_dirty(bh); /* Expose it to the VM */ -} - -/* - * Remove buffer from all transactions. - * - * Called with bh_state lock and j_list_lock - * - * jh and bh may be already freed when this function returns. - */ -void __journal_unfile_buffer(struct journal_head *jh) -{ - __journal_temp_unlink_buffer(jh); - jh->b_transaction = NULL; - journal_put_journal_head(jh); -} - -void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - __journal_unfile_buffer(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __brelse(bh); -} - -/* - * Called from journal_try_to_free_buffers(). - * - * Called under jbd_lock_bh_state(bh) - */ -static void -__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) -{ - struct journal_head *jh; - - jh = bh2jh(bh); - - if (buffer_locked(bh) || buffer_dirty(bh)) - goto out; - - if (jh->b_next_transaction != NULL) - goto out; - - spin_lock(&journal->j_list_lock); - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { - /* A written-back ordered data buffer */ - JBUFFER_TRACE(jh, "release data"); - __journal_unfile_buffer(jh); - } - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { - /* written-back checkpointed metadata buffer */ - if (jh->b_jlist == BJ_None) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - __journal_remove_checkpoint(jh); - } - } - spin_unlock(&journal->j_list_lock); -out: - return; -} - -/** - * int journal_try_to_free_buffers() - try to free page buffers. - * @journal: journal for operation - * @page: to try and free - * @gfp_mask: we use the mask to detect how hard should we try to release - * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to - * release the buffers. - * - * - * For all the buffers on this page, - * if they are fully written out ordered data, move them onto BUF_CLEAN - * so try_to_free_buffers() can reap them. - * - * This function returns non-zero if we wish try_to_free_buffers() - * to be called. We do this if the page is releasable by try_to_free_buffers(). - * We also do it if the page has locked or dirty buffers and the caller wants - * us to perform sync or async writeout. - * - * This complicates JBD locking somewhat. We aren't protected by the - * BKL here. We wish to remove the buffer from its committing or - * running transaction's ->t_datalist via __journal_unfile_buffer. - * - * This may *change* the value of transaction_t->t_datalist, so anyone - * who looks at t_datalist needs to lock against this function. - * - * Even worse, someone may be doing a journal_dirty_data on this - * buffer. So we need to lock against that. journal_dirty_data() - * will come out of the lock with the buffer dirty, which makes it - * ineligible for release here. - * - * Who else is affected by this? hmm... Really the only contender - * is do_get_write_access() - it could be looking at the buffer while - * journal_try_to_free_buffer() is changing its state. But that - * cannot happen because we never reallocate freed data as metadata - * while the data is part of a transaction. Yes? - * - * Return 0 on failure, 1 on success - */ -int journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t gfp_mask) -{ - struct buffer_head *head; - struct buffer_head *bh; - int ret = 0; - - J_ASSERT(PageLocked(page)); - - head = page_buffers(page); - bh = head; - do { - struct journal_head *jh; - - /* - * We take our own ref against the journal_head here to avoid - * having to add tons of locking around each instance of - * journal_put_journal_head(). - */ - jh = journal_grab_journal_head(bh); - if (!jh) - continue; - - jbd_lock_bh_state(bh); - __journal_try_to_free_buffer(journal, bh); - journal_put_journal_head(jh); - jbd_unlock_bh_state(bh); - if (buffer_jbd(bh)) - goto busy; - } while ((bh = bh->b_this_page) != head); - - ret = try_to_free_buffers(page); - -busy: - return ret; -} - -/* - * This buffer is no longer needed. If it is on an older transaction's - * checkpoint list we need to record it on this transaction's forget list - * to pin this buffer (and hence its checkpointing transaction) down until - * this transaction commits. If the buffer isn't on a checkpoint list, we - * release it. - * Returns non-zero if JBD no longer has an interest in the buffer. - * - * Called under j_list_lock. - * - * Called under jbd_lock_bh_state(bh). - */ -static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) -{ - int may_free = 1; - struct buffer_head *bh = jh2bh(jh); - - if (jh->b_cp_transaction) { - JBUFFER_TRACE(jh, "on running+cp transaction"); - __journal_temp_unlink_buffer(jh); - /* - * We don't want to write the buffer anymore, clear the - * bit so that we don't confuse checks in - * __journal_file_buffer - */ - clear_buffer_dirty(bh); - __journal_file_buffer(jh, transaction, BJ_Forget); - may_free = 0; - } else { - JBUFFER_TRACE(jh, "on running transaction"); - __journal_unfile_buffer(jh); - } - return may_free; -} - -/* - * journal_invalidatepage - * - * This code is tricky. It has a number of cases to deal with. - * - * There are two invariants which this code relies on: - * - * i_size must be updated on disk before we start calling invalidatepage on the - * data. - * - * This is done in ext3 by defining an ext3_setattr method which - * updates i_size before truncate gets going. By maintaining this - * invariant, we can be sure that it is safe to throw away any buffers - * attached to the current transaction: once the transaction commits, - * we know that the data will not be needed. - * - * Note however that we can *not* throw away data belonging to the - * previous, committing transaction! - * - * Any disk blocks which *are* part of the previous, committing - * transaction (and which therefore cannot be discarded immediately) are - * not going to be reused in the new running transaction - * - * The bitmap committed_data images guarantee this: any block which is - * allocated in one transaction and removed in the next will be marked - * as in-use in the committed_data bitmap, so cannot be reused until - * the next transaction to delete the block commits. This means that - * leaving committing buffers dirty is quite safe: the disk blocks - * cannot be reallocated to a different file and so buffer aliasing is - * not possible. - * - * - * The above applies mainly to ordered data mode. In writeback mode we - * don't make guarantees about the order in which data hits disk --- in - * particular we don't guarantee that new dirty data is flushed before - * transaction commit --- so it is always safe just to discard data - * immediately in that mode. --sct - */ - -/* - * The journal_unmap_buffer helper function returns zero if the buffer - * concerned remains pinned as an anonymous buffer belonging to an older - * transaction. - * - * We're outside-transaction here. Either or both of j_running_transaction - * and j_committing_transaction may be NULL. - */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, - int partial_page) -{ - transaction_t *transaction; - struct journal_head *jh; - int may_free = 1; - - BUFFER_TRACE(bh, "entry"); - -retry: - /* - * It is safe to proceed here without the j_list_lock because the - * buffers cannot be stolen by try_to_free_buffers as long as we are - * holding the page lock. --sct - */ - - if (!buffer_jbd(bh)) - goto zap_buffer_unlocked; - - spin_lock(&journal->j_state_lock); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - jh = journal_grab_journal_head(bh); - if (!jh) - goto zap_buffer_no_jh; - - /* - * We cannot remove the buffer from checkpoint lists until the - * transaction adding inode to orphan list (let's call it T) - * is committed. Otherwise if the transaction changing the - * buffer would be cleaned from the journal before T is - * committed, a crash will cause that the correct contents of - * the buffer will be lost. On the other hand we have to - * clear the buffer dirty bit at latest at the moment when the - * transaction marking the buffer as freed in the filesystem - * structures is committed because from that moment on the - * block can be reallocated and used by a different page. - * Since the block hasn't been freed yet but the inode has - * already been added to orphan list, it is safe for us to add - * the buffer to BJ_Forget list of the newest transaction. - * - * Also we have to clear buffer_mapped flag of a truncated buffer - * because the buffer_head may be attached to the page straddling - * i_size (can happen only when blocksize < pagesize) and thus the - * buffer_head can be reused when the file is extended again. So we end - * up keeping around invalidated buffers attached to transactions' - * BJ_Forget list just to stop checkpointing code from cleaning up - * the transaction this buffer was modified in. - */ - transaction = jh->b_transaction; - if (transaction == NULL) { - /* First case: not on any transaction. If it - * has no checkpoint link, then we can zap it: - * it's a writeback-mode buffer so we don't care - * if it hits disk safely. */ - if (!jh->b_cp_transaction) { - JBUFFER_TRACE(jh, "not on any transaction: zap"); - goto zap_buffer; - } - - if (!buffer_dirty(bh)) { - /* bdflush has written it. We can drop it now */ - goto zap_buffer; - } - - /* OK, it must be in the journal but still not - * written fully to disk: it's metadata or - * journaled data... */ - - if (journal->j_running_transaction) { - /* ... and once the current transaction has - * committed, the buffer won't be needed any - * longer. */ - JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - may_free = __dispose_buffer(jh, - journal->j_running_transaction); - goto zap_buffer; - } else { - /* There is no currently-running transaction. So the - * orphan record which we wrote for this file must have - * passed into commit. We must attach this buffer to - * the committing transaction, if it exists. */ - if (journal->j_committing_transaction) { - JBUFFER_TRACE(jh, "give to committing trans"); - may_free = __dispose_buffer(jh, - journal->j_committing_transaction); - goto zap_buffer; - } else { - /* The orphan record's transaction has - * committed. We can cleanse this buffer */ - clear_buffer_jbddirty(bh); - goto zap_buffer; - } - } - } else if (transaction == journal->j_committing_transaction) { - JBUFFER_TRACE(jh, "on committing transaction"); - if (jh->b_jlist == BJ_Locked) { - /* - * The buffer is on the committing transaction's locked - * list. We have the buffer locked, so I/O has - * completed. So we can nail the buffer now. - */ - may_free = __dispose_buffer(jh, transaction); - goto zap_buffer; - } - /* - * The buffer is committing, we simply cannot touch - * it. If the page is straddling i_size we have to wait - * for commit and try again. - */ - if (partial_page) { - tid_t tid = journal->j_committing_transaction->t_tid; - - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - unlock_buffer(bh); - log_wait_commit(journal, tid); - lock_buffer(bh); - goto retry; - } - /* - * OK, buffer won't be reachable after truncate. We just set - * j_next_transaction to the running transaction (if there is - * one) and mark buffer as freed so that commit code knows it - * should clear dirty bits when it is done with the buffer. - */ - set_buffer_freed(bh); - if (journal->j_running_transaction && buffer_jbddirty(bh)) - jh->b_next_transaction = journal->j_running_transaction; - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return 0; - } else { - /* Good, the buffer belongs to the running transaction. - * We are writing our own transaction's data, not any - * previous one's, so it is safe to throw it away - * (remember that we expect the filesystem to have set - * i_size already for this truncate so recovery will not - * expose the disk blocks we are discarding here.) */ - J_ASSERT_JH(jh, transaction == journal->j_running_transaction); - JBUFFER_TRACE(jh, "on running transaction"); - may_free = __dispose_buffer(jh, transaction); - } - -zap_buffer: - /* - * This is tricky. Although the buffer is truncated, it may be reused - * if blocksize < pagesize and it is attached to the page straddling - * EOF. Since the buffer might have been added to BJ_Forget list of the - * running transaction, journal_get_write_access() won't clear - * b_modified and credit accounting gets confused. So clear b_modified - * here. */ - jh->b_modified = 0; - journal_put_journal_head(jh); -zap_buffer_no_jh: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); -zap_buffer_unlocked: - clear_buffer_dirty(bh); - J_ASSERT_BH(bh, !buffer_jbddirty(bh)); - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - bh->b_bdev = NULL; - return may_free; -} - -/** - * void journal_invalidatepage() - invalidate a journal page - * @journal: journal to use for flush - * @page: page to flush - * @offset: offset of the range to invalidate - * @length: length of the range to invalidate - * - * Reap page buffers containing data in specified range in page. - */ -void journal_invalidatepage(journal_t *journal, - struct page *page, - unsigned int offset, - unsigned int length) -{ - struct buffer_head *head, *bh, *next; - unsigned int stop = offset + length; - unsigned int curr_off = 0; - int partial_page = (offset || length < PAGE_CACHE_SIZE); - int may_free = 1; - - if (!PageLocked(page)) - BUG(); - if (!page_has_buffers(page)) - return; - - BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); - - /* We will potentially be playing with lists other than just the - * data lists (especially for journaled data mode), so be - * cautious in our locking. */ - - head = bh = page_buffers(page); - do { - unsigned int next_off = curr_off + bh->b_size; - next = bh->b_this_page; - - if (next_off > stop) - return; - - if (offset <= curr_off) { - /* This block is wholly outside the truncation point */ - lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh, - partial_page); - unlock_buffer(bh); - } - curr_off = next_off; - bh = next; - - } while (bh != head); - - if (!partial_page) { - if (may_free && try_to_free_buffers(page)) - J_ASSERT(!page_has_buffers(page)); - } -} - -/* - * File a buffer on the given transaction list. - */ -void __journal_file_buffer(struct journal_head *jh, - transaction_t *transaction, int jlist) -{ - struct journal_head **list = NULL; - int was_dirty = 0; - struct buffer_head *bh = jh2bh(jh); - - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); - assert_spin_locked(&transaction->t_journal->j_list_lock); - - J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); - J_ASSERT_JH(jh, jh->b_transaction == transaction || - jh->b_transaction == NULL); - - if (jh->b_transaction && jh->b_jlist == jlist) - return; - - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - /* - * For metadata buffers, we track dirty bit in buffer_jbddirty - * instead of buffer_dirty. We should not see a dirty bit set - * here because we clear it in do_get_write_access but e.g. - * tune2fs can modify the sb and set the dirty bit at any time - * so we try to gracefully handle that. - */ - if (buffer_dirty(bh)) - warn_dirty_buffer(bh); - if (test_clear_buffer_dirty(bh) || - test_clear_buffer_jbddirty(bh)) - was_dirty = 1; - } - - if (jh->b_transaction) - __journal_temp_unlink_buffer(jh); - else - journal_grab_journal_head(bh); - jh->b_transaction = transaction; - - switch (jlist) { - case BJ_None: - J_ASSERT_JH(jh, !jh->b_committed_data); - J_ASSERT_JH(jh, !jh->b_frozen_data); - return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; - case BJ_Metadata: - transaction->t_nr_buffers++; - list = &transaction->t_buffers; - break; - case BJ_Forget: - list = &transaction->t_forget; - break; - case BJ_IO: - list = &transaction->t_iobuf_list; - break; - case BJ_Shadow: - list = &transaction->t_shadow_list; - break; - case BJ_LogCtl: - list = &transaction->t_log_list; - break; - case BJ_Reserved: - list = &transaction->t_reserved_list; - break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; - } - - __blist_add_buffer(list, jh); - jh->b_jlist = jlist; - - if (was_dirty) - set_buffer_jbddirty(bh); -} - -void journal_file_buffer(struct journal_head *jh, - transaction_t *transaction, int jlist) -{ - jbd_lock_bh_state(jh2bh(jh)); - spin_lock(&transaction->t_journal->j_list_lock); - __journal_file_buffer(jh, transaction, jlist); - spin_unlock(&transaction->t_journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); -} - -/* - * Remove a buffer from its current buffer list in preparation for - * dropping it from its current transaction entirely. If the buffer has - * already started to be used by a subsequent transaction, refile the - * buffer on that transaction's metadata list. - * - * Called under j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)) - * - * jh and bh may be already free when this function returns - */ -void __journal_refile_buffer(struct journal_head *jh) -{ - int was_dirty, jlist; - struct buffer_head *bh = jh2bh(jh); - - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); - if (jh->b_transaction) - assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); - - /* If the buffer is now unused, just drop it. */ - if (jh->b_next_transaction == NULL) { - __journal_unfile_buffer(jh); - return; - } - - /* - * It has been modified by a later transaction: add it to the new - * transaction's metadata list. - */ - - was_dirty = test_clear_buffer_jbddirty(bh); - __journal_temp_unlink_buffer(jh); - /* - * We set b_transaction here because b_next_transaction will inherit - * our jh reference and thus __journal_file_buffer() must not take a - * new one. - */ - jh->b_transaction = jh->b_next_transaction; - jh->b_next_transaction = NULL; - if (buffer_freed(bh)) - jlist = BJ_Forget; - else if (jh->b_modified) - jlist = BJ_Metadata; - else - jlist = BJ_Reserved; - __journal_file_buffer(jh, jh->b_transaction, jlist); - J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); - - if (was_dirty) - set_buffer_jbddirty(bh); -} - -/* - * __journal_refile_buffer() with necessary locking added. We take our bh - * reference so that we can safely unlock bh. - * - * The jh and bh may be freed by this call. - */ -void journal_refile_buffer(journal_t *journal, struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_list_lock); - __brelse(bh); -} diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 4227dc4f7437..8c44654ce274 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -417,12 +417,12 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * journal_clean_one_cp_list * * Find all the written-back checkpoint buffers in the given list and - * release them. + * release them. If 'destroy' is set, clean all buffers unconditionally. * * Called with j_list_lock held. * Returns 1 if we freed the transaction, 0 otherwise. */ -static int journal_clean_one_cp_list(struct journal_head *jh) +static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) { struct journal_head *last_jh; struct journal_head *next_jh = jh; @@ -436,7 +436,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh) do { jh = next_jh; next_jh = jh->b_cpnext; - ret = __try_to_free_cp_buf(jh); + if (!destroy) + ret = __try_to_free_cp_buf(jh); + else + ret = __jbd2_journal_remove_checkpoint(jh) + 1; if (!ret) return freed; if (ret == 2) @@ -459,10 +462,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh) * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. + * If 'destroy' is set, release all buffers unconditionally. * * Called with j_list_lock held. */ -void __jbd2_journal_clean_checkpoint_list(journal_t *journal) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; int ret; @@ -476,7 +480,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list, + destroy); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -492,7 +497,7 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) * we can possibly see not yet submitted buffers on io_list */ ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list); + t_checkpoint_io_list, destroy); if (need_resched()) return; /* @@ -506,6 +511,28 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal) } /* + * Remove buffers from all checkpoint lists as journal is aborted and we just + * need to free memory + */ +void jbd2_journal_destroy_checkpoint(journal_t *journal) +{ + /* + * We loop because __jbd2_journal_clean_checkpoint_list() may abort + * early due to a need of rescheduling. + */ + while (1) { + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + break; + } + __jbd2_journal_clean_checkpoint_list(journal, true); + spin_unlock(&journal->j_list_lock); + cond_resched(); + } +} + +/* * journal_remove_checkpoint: called after a buffer has been committed * to disk (either by being write-back flushed to disk, or being * committed to the log). diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b73e0215baa7..362e5f614450 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -510,7 +510,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * frees some memory */ spin_lock(&journal->j_list_lock); - __jbd2_journal_clean_checkpoint_list(journal); + __jbd2_journal_clean_checkpoint_list(journal, false); spin_unlock(&journal->j_list_lock); jbd_debug(3, "JBD2: commit phase 1\n"); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 4ff3fad4e9e3..8270fe9e3641 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1456,7 +1456,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_SYNC); + jbd2_write_superblock(journal, WRITE_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1693,8 +1693,17 @@ int jbd2_journal_destroy(journal_t *journal) while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); mutex_lock(&journal->j_checkpoint_mutex); - jbd2_log_do_checkpoint(journal); + err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); + /* + * If checkpointing failed, just free the buffers to avoid + * looping forever + */ + if (err) { + jbd2_journal_destroy_checkpoint(journal); + spin_lock(&journal->j_list_lock); + break; + } spin_lock(&journal->j_list_lock); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index f3d06174b051..6b8338ec2464 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -204,6 +204,20 @@ static int add_transaction_credits(journal_t *journal, int blocks, * attach this handle to a new transaction. */ atomic_sub(total, &t->t_outstanding_credits); + + /* + * Is the number of reserved credits in the current transaction too + * big to fit this handle? Wait until reserved credits are freed. + */ + if (atomic_read(&journal->j_reserved_credits) + total > + journal->j_max_transaction_buffers) { + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) + total <= + journal->j_max_transaction_buffers); + return 1; + } + wait_transaction_locked(journal); return 1; } @@ -262,20 +276,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle, int rsv_blocks = 0; unsigned long ts = jiffies; + if (handle->h_rsv_handle) + rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + /* - * 1/2 of transaction can be reserved so we can practically handle - * only 1/2 of maximum transaction size per operation + * Limit the number of reserved credits to 1/2 of maximum transaction + * size and limit the number of total credits to not exceed maximum + * transaction size per operation. */ - if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { - printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", - current->comm, blocks, - journal->j_max_transaction_buffers / 2); + if ((rsv_blocks > journal->j_max_transaction_buffers / 2) || + (rsv_blocks + blocks > journal->j_max_transaction_buffers)) { + printk(KERN_ERR "JBD2: %s wants too many credits " + "credits:%d rsv_credits:%d max:%d\n", + current->comm, blocks, rsv_blocks, + journal->j_max_transaction_buffers); + WARN_ON(1); return -ENOSPC; } - if (handle->h_rsv_handle) - rsv_blocks = handle->h_rsv_handle->h_buffer_credits; - alloc_transaction: if (!journal->j_running_transaction) { /* @@ -1280,8 +1298,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, triggers->t_abort(triggers, jh2bh(jh)); } - - /** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. @@ -1314,12 +1330,41 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - journal = transaction->t_journal; - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) { + if (!buffer_jbd(bh)) { ret = -EUCLEAN; goto out; } + /* + * We don't grab jh reference here since the buffer must be part + * of the running transaction. + */ + jh = bh2jh(bh); + /* + * This and the following assertions are unreliable since we may see jh + * in inconsistent state unless we grab bh_state lock. But this is + * crucial to catch bugs so let's do a reliable check until the + * lockless handling is fully proven. + */ + if (jh->b_transaction != transaction && + jh->b_next_transaction != transaction) { + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_next_transaction == transaction); + jbd_unlock_bh_state(bh); + } + if (jh->b_modified == 1) { + /* If it's in our transaction it must be in BJ_Metadata list. */ + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) { + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction != transaction || + jh->b_jlist == BJ_Metadata); + jbd_unlock_bh_state(bh); + } + goto out; + } + + journal = transaction->t_journal; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); @@ -1410,7 +1455,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); - jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); return ret; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index d200a9b8fd5e..824e61ede465 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -19,7 +19,7 @@ struct kstatfs; struct kvec; -#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) +#define JFFS2_INODE_INFO(i) (container_of(i, struct jffs2_inode_info, vfs_inode)) #define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) #define JFFS2_SB_INFO(sb) (sb->s_fs_info) #define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index e98d39d75cf4..0e026a7bdcd4 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -76,7 +76,7 @@ static int jfs_open(struct inode *inode, struct file *file) if (ji->active_ag == -1) { struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); - atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); + atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } @@ -107,8 +107,11 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) if (rc) return rc; - if (is_quota_modification(inode, iattr)) - dquot_initialize(inode); + if (is_quota_modification(inode, iattr)) { + rc = dquot_initialize(inode); + if (rc) + return rc; + } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { rc = dquot_transfer(inode, iattr); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 6f1cb2b5ee28..41aa3ca6a6a4 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -134,11 +134,11 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) * It has been committed since the last change, but was still * on the dirty inode list. */ - if (!test_cflag(COMMIT_Dirty, inode)) { + if (!test_cflag(COMMIT_Dirty, inode)) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); return 0; - } + } if (jfs_commit_inode(inode, wait)) { jfs_err("jfs_write_inode: jfs_commit_inode failed!"); diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 93a1232894f6..8db8b7d61e40 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -180,9 +180,6 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case JFS_IOC_SETFLAGS32: cmd = JFS_IOC_SETFLAGS; break; - case FITRIM: - cmd = FITRIM; - break; } return jfs_ioctl(filp, cmd, arg); } diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index fa7e795bd8ae..1f26d1910409 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -206,7 +206,7 @@ struct jfs_sb_info { static inline struct jfs_inode_info *JFS_IP(struct inode *inode) { - return list_entry(inode, struct jfs_inode_info, vfs_inode); + return container_of(inode, struct jfs_inode_info, vfs_inode); } static inline int jfs_dirtable_inline(struct inode *inode) diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 6b0f816201a2..cf7936fe2e68 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -109,7 +109,9 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. */ - dquot_initialize(inode); + rc = dquot_initialize(inode); + if (rc) + goto fail_drop; rc = dquot_alloc_inode(inode); if (rc) goto fail_drop; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index bc462dcd7a40..a69bdf2a1085 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1999,19 +1999,16 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; - bio->bi_io_vec[0].bv_page = bp->l_page; - bio->bi_io_vec[0].bv_len = LOGPSIZE; - bio->bi_io_vec[0].bv_offset = bp->l_offset; - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = LOGPSIZE; + bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; - lbmIODone(bio, 0); + lbmIODone(bio); } else { submit_bio(READ_SYNC, bio); } @@ -2145,12 +2142,9 @@ static void lbmStartIO(struct lbuf * bp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); bio->bi_bdev = log->bdev; - bio->bi_io_vec[0].bv_page = bp->l_page; - bio->bi_io_vec[0].bv_len = LOGPSIZE; - bio->bi_io_vec[0].bv_offset = bp->l_offset; - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = LOGPSIZE; + bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; @@ -2158,7 +2152,7 @@ static void lbmStartIO(struct lbuf * bp) /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_iter.bi_size = 0; - lbmIODone(bio, 0); + lbmIODone(bio); } else { submit_bio(WRITE_SYNC, bio); INCREMENT(lmStat.submitted); @@ -2196,7 +2190,7 @@ static int lbmIOWait(struct lbuf * bp, int flag) * * executed at INTIODONE level */ -static void lbmIODone(struct bio *bio, int error) +static void lbmIODone(struct bio *bio) { struct lbuf *bp = bio->bi_private; struct lbuf *nextbp, *tail; @@ -2212,7 +2206,7 @@ static void lbmIODone(struct bio *bio, int error) bp->l_flag |= lbmDONE; - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (bio->bi_error) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 16a0922beb59..a3eb316b1ac3 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -276,11 +276,11 @@ static void last_read_complete(struct page *page) unlock_page(page); } -static void metapage_read_end_io(struct bio *bio, int err) +static void metapage_read_end_io(struct bio *bio) { struct page *page = bio->bi_private; - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (bio->bi_error) { printk(KERN_ERR "metapage_read_end_io: I/O error\n"); SetPageError(page); } @@ -331,13 +331,13 @@ static void last_write_complete(struct page *page) end_page_writeback(page); } -static void metapage_write_end_io(struct bio *bio, int err) +static void metapage_write_end_io(struct bio *bio) { struct page *page = bio->bi_private; BUG_ON(!PagePrivate(page)); - if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (bio->bi_error) { printk(KERN_ERR "metapage_write_end_io: I/O error\n"); SetPageError(page); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index e33be921aa41..35976bdccafc 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -86,7 +86,9 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry); - dquot_initialize(dip); + rc = dquot_initialize(dip); + if (rc) + goto out1; /* * search parent directory for entry/freespace @@ -218,7 +220,9 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry); - dquot_initialize(dip); + rc = dquot_initialize(dip); + if (rc) + goto out1; /* * search parent directory for entry/freespace @@ -355,8 +359,12 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry); /* Init inode for quota operations. */ - dquot_initialize(dip); - dquot_initialize(ip); + rc = dquot_initialize(dip); + if (rc) + goto out; + rc = dquot_initialize(ip); + if (rc) + goto out; /* directory must be empty to be removed */ if (!dtEmpty(ip)) { @@ -483,8 +491,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry); /* Init inode for quota operations. */ - dquot_initialize(dip); - dquot_initialize(ip); + rc = dquot_initialize(dip); + if (rc) + goto out; + rc = dquot_initialize(ip); + if (rc) + goto out; if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -799,7 +811,9 @@ static int jfs_link(struct dentry *old_dentry, jfs_info("jfs_link: %pd %pd", old_dentry, dentry); - dquot_initialize(dir); + rc = dquot_initialize(dir); + if (rc) + goto out; tid = txBegin(ip->i_sb, 0); @@ -810,7 +824,7 @@ static int jfs_link(struct dentry *old_dentry, * scan parent directory for entry/freespace */ if ((rc = get_UCSname(&dname, dentry))) - goto out; + goto out_tx; if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) goto free_dname; @@ -842,12 +856,13 @@ static int jfs_link(struct dentry *old_dentry, free_dname: free_UCSname(&dname); - out: + out_tx: txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dir)->commit_mutex); + out: jfs_info("jfs_link: rc:%d", rc); return rc; } @@ -891,7 +906,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); - dquot_initialize(dip); + rc = dquot_initialize(dip); + if (rc) + goto out1; ssize = strlen(name) + 1; @@ -1082,8 +1099,12 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry); - dquot_initialize(old_dir); - dquot_initialize(new_dir); + rc = dquot_initialize(old_dir); + if (rc) + goto out1; + rc = dquot_initialize(new_dir); + if (rc) + goto out1; old_ip = d_inode(old_dentry); new_ip = d_inode(new_dentry); @@ -1130,7 +1151,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (new_ip) { IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ - dquot_initialize(new_ip); + rc = dquot_initialize(new_ip); + if (rc) + goto out_unlock; } /* @@ -1160,7 +1183,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = dtModify(tid, new_dir, &new_dname, &ino, old_ip->i_ino, JFS_RENAME); if (rc) - goto out4; + goto out_tx; drop_nlink(new_ip); if (S_ISDIR(new_ip->i_mode)) { drop_nlink(new_ip); @@ -1185,7 +1208,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if ((new_size = commitZeroLink(tid, new_ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ rc = new_size; - goto out4; + goto out_tx; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; @@ -1203,7 +1226,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rc) { jfs_err("jfs_rename didn't expect dtSearch to fail " "w/rc = %d", rc); - goto out4; + goto out_tx; } ino = old_ip->i_ino; @@ -1211,7 +1234,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rc) { if (rc == -EIO) jfs_err("jfs_rename: dtInsert returned -EIO"); - goto out4; + goto out_tx; } if (S_ISDIR(old_ip->i_mode)) inc_nlink(new_dir); @@ -1226,7 +1249,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_err("jfs_rename did not expect dtDelete to return rc = %d", rc); txAbort(tid, 1); /* Marks Filesystem dirty */ - goto out4; + goto out_tx; } if (S_ISDIR(old_ip->i_mode)) { drop_nlink(old_dir); @@ -1285,7 +1308,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = txCommit(tid, ipcount, iplist, commit_flag); - out4: + out_tx: txEnd(tid); if (new_ip) mutex_unlock(&JFS_IP(new_ip)->commit_mutex); @@ -1308,13 +1331,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new_ip && (new_ip->i_nlink == 0)) set_cflag(COMMIT_Nolink, new_ip); - out3: - free_UCSname(&new_dname); - out2: - free_UCSname(&old_dname); - out1: - if (new_ip && !S_ISDIR(new_ip->i_mode)) - IWRITE_UNLOCK(new_ip); /* * Truncating the directory index table is not guaranteed. It * may need to be done iteratively @@ -1325,7 +1341,14 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, clear_cflag(COMMIT_Stale, old_dir); } - + out_unlock: + if (new_ip && !S_ISDIR(new_ip->i_mode)) + IWRITE_UNLOCK(new_ip); + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: jfs_info("jfs_rename: returning %d", rc); return rc; } @@ -1354,7 +1377,9 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_info("jfs_mknod: %pd", dentry); - dquot_initialize(dir); + rc = dquot_initialize(dir); + if (rc) + goto out; if ((rc = get_UCSname(&dname, dentry))) goto out; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index fffca9517321..91e004518237 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) } /** + * kernfs_path_len - determine the length of the full path of a given node + * @kn: kernfs_node of interest + * + * The returned length doesn't include the space for the terminating '\0'. + */ +size_t kernfs_path_len(struct kernfs_node *kn) +{ + size_t len = 0; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + do { + len += strlen(kn->name) + 1; + kn = kn->parent; + } while (kn && kn->parent); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return len; +} + +/** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into @@ -592,6 +615,9 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; ret = -ENOENT; + if (parent->flags & KERNFS_EMPTY_DIR) + goto out_unlock; + if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) goto out_unlock; @@ -783,6 +809,38 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, return ERR_PTR(rc); } +/** + * kernfs_create_empty_dir - create an always empty directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * + * Returns the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, + const char *name) +{ + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->flags |= KERNFS_EMPTY_DIR; + kn->dir.root = parent->dir.root; + kn->ns = NULL; + kn->priv = NULL; + + /* link in */ + rc = kernfs_add_one(kn); + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -1254,7 +1312,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&kernfs_mutex); error = -ENOENT; - if (!kernfs_active(kn) || !kernfs_active(new_parent)) + if (!kernfs_active(kn) || !kernfs_active(new_parent) || + (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; error = 0; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 2da8493a380b..756dd56aaf60 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -296,6 +296,8 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) case KERNFS_DIR: inode->i_op = &kernfs_dir_iops; inode->i_fop = &kernfs_dir_fops; + if (kn->flags & KERNFS_EMPTY_DIR) + make_empty_dir_inode(inode); break; case KERNFS_FILE: inode->i_size = kn->attr.size; diff --git a/fs/libfs.c b/fs/libfs.c index 65e1feca8b98..c7cbfb092e94 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,11 +20,6 @@ #include "internal.h" -static inline int simple_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -1108,3 +1103,98 @@ const struct inode_operations simple_symlink_inode_operations = { .readlink = generic_readlink }; EXPORT_SYMBOL(simple_symlink_inode_operations); + +/* + * Operations for a permanently empty directory. + */ +static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return ERR_PTR(-ENOENT); +} + +static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + generic_fillattr(inode, stat); + return 0; +} + +static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) +{ + return -EPERM; +} + +static int empty_dir_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + return -EOPNOTSUPP; +} + +static int empty_dir_removexattr(struct dentry *dentry, const char *name) +{ + return -EOPNOTSUPP; +} + +static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) +{ + return -EOPNOTSUPP; +} + +static const struct inode_operations empty_dir_inode_operations = { + .lookup = empty_dir_lookup, + .permission = generic_permission, + .setattr = empty_dir_setattr, + .getattr = empty_dir_getattr, + .setxattr = empty_dir_setxattr, + .getxattr = empty_dir_getxattr, + .removexattr = empty_dir_removexattr, + .listxattr = empty_dir_listxattr, +}; + +static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) +{ + /* An empty directory has two entries . and .. at offsets 0 and 1 */ + return generic_file_llseek_size(file, offset, whence, 2, 2); +} + +static int empty_dir_readdir(struct file *file, struct dir_context *ctx) +{ + dir_emit_dots(file, ctx); + return 0; +} + +static const struct file_operations empty_dir_operations = { + .llseek = empty_dir_llseek, + .read = generic_read_dir, + .iterate = empty_dir_readdir, + .fsync = noop_fsync, +}; + + +void make_empty_dir_inode(struct inode *inode) +{ + set_nlink(inode, 2); + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_rdev = 0; + inode->i_size = 0; + inode->i_blkbits = PAGE_SHIFT; + inode->i_blocks = 0; + + inode->i_op = &empty_dir_inode_operations; + inode->i_fop = &empty_dir_operations; +} + +bool is_empty_dir_inode(struct inode *inode) +{ + return (inode->i_fop == &empty_dir_operations) && + (inode->i_op == &empty_dir_inode_operations); +} diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 55505cbe11af..d678bcc3cbcb 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -322,6 +322,11 @@ out_rqst: return error; } +static struct svc_serv_ops lockd_sv_ops = { + .svo_shutdown = svc_rpcb_cleanup, + .svo_enqueue_xprt = svc_xprt_do_enqueue, +}; + static struct svc_serv *lockd_create_svc(void) { struct svc_serv *serv; @@ -350,7 +355,7 @@ static struct svc_serv *lockd_create_svc(void) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return ERR_PTR(-ENOMEM); @@ -586,6 +591,7 @@ static int lockd_init_net(struct net *net) INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->lockd_manager.list); + ln->lockd_manager.block_opens = false; spin_lock_init(&ln->nsm_clnt_lock); return 0; } diff --git a/fs/locks.c b/fs/locks.c index 653faabb07f4..2a54c800a223 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -862,12 +862,11 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, * whether or not a lock was successfully freed by testing the return * value for -ENOENT. */ -static int flock_lock_file(struct file *filp, struct file_lock *request) +static int flock_lock_inode(struct inode *inode, struct file_lock *request) { struct file_lock *new_fl = NULL; struct file_lock *fl; struct file_lock_context *ctx; - struct inode *inode = file_inode(filp); int error = 0; bool found = false; LIST_HEAD(dispose); @@ -890,7 +889,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) goto find_conflict; list_for_each_entry(fl, &ctx->flc_flock, fl_list) { - if (filp != fl->fl_file) + if (request->fl_file != fl->fl_file) continue; if (request->fl_type == fl->fl_type) goto out; @@ -1164,20 +1163,19 @@ int posix_lock_file(struct file *filp, struct file_lock *fl, EXPORT_SYMBOL(posix_lock_file); /** - * posix_lock_file_wait - Apply a POSIX-style lock to a file - * @filp: The file to apply the lock to + * posix_lock_inode_wait - Apply a POSIX-style lock to a file + * @inode: inode of file to which lock request should be applied * @fl: The lock to be applied * - * Add a POSIX style lock to a file. - * We merge adjacent & overlapping locks whenever possible. - * POSIX locks are sorted by owner task, then by starting address + * Variant of posix_lock_file_wait that does not take a filp, and so can be + * used after the filp has already been torn down. */ -int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep (); for (;;) { - error = posix_lock_file(filp, fl, NULL); + error = __posix_lock_file(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1189,7 +1187,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl) } return error; } -EXPORT_SYMBOL(posix_lock_file_wait); +EXPORT_SYMBOL(posix_lock_inode_wait); /** * locks_mandatory_locked - Check for an active lock @@ -1570,6 +1568,7 @@ int fcntl_getlease(struct file *filp) * desired lease. * @dentry: dentry to check * @arg: type of lease that we're trying to acquire + * @flags: current lock flags * * Check to see if there's an existing open fd on this file that would * conflict with the lease we're trying to set. @@ -1851,18 +1850,18 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) } /** - * flock_lock_file_wait - Apply a FLOCK-style lock to a file - * @filp: The file to apply the lock to + * flock_lock_inode_wait - Apply a FLOCK-style lock to a file + * @inode: inode of the file to apply to * @fl: The lock to be applied * - * Add a FLOCK style lock to a file. + * Apply a FLOCK style lock request to an inode. */ -int flock_lock_file_wait(struct file *filp, struct file_lock *fl) +int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep(); for (;;) { - error = flock_lock_file(filp, fl); + error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1874,8 +1873,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl) } return error; } - -EXPORT_SYMBOL(flock_lock_file_wait); +EXPORT_SYMBOL(flock_lock_inode_wait); /** * sys_flock: - flock() system call. @@ -2401,7 +2399,8 @@ locks_remove_flock(struct file *filp) .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; - struct file_lock_context *flctx = file_inode(filp)->i_flctx; + struct inode *inode = file_inode(filp); + struct file_lock_context *flctx = inode->i_flctx; if (list_empty(&flctx->flc_flock)) return; @@ -2409,7 +2408,7 @@ locks_remove_flock(struct file *filp) if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); else - flock_lock_file(filp, &fl); + flock_lock_inode(inode, &fl); if (fl.fl_ops && fl.fl_ops->fl_release_private) fl.fl_ops->fl_release_private(&fl); diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 76279e11982d..a7fdbd868474 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -53,16 +53,14 @@ static int bdev_readpage(void *_sb, struct page *page) static DECLARE_WAIT_QUEUE_HEAD(wq); -static void writeseg_end_io(struct bio *bio, int err) +static void writeseg_end_io(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec; int i; struct super_block *sb = bio->bi_private; struct logfs_super *super = logfs_super(sb); - BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ - BUG_ON(err); + BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ bio_for_each_segment_all(bvec, bio, i) { end_page_writeback(bvec->bv_page); @@ -83,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev)); + max_pages = min(nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); @@ -153,14 +151,12 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) } -static void erase_end_io(struct bio *bio, int err) +static void erase_end_io(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct super_block *sb = bio->bi_private; struct logfs_super *super = logfs_super(sb); - BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ - BUG_ON(err); + BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ BUG_ON(bio->bi_vcnt == 0); bio_put(bio); if (atomic_dec_and_test(&super->s_pending_writes)) @@ -175,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev)); + max_pages = min(nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 118e4e7bc935..d19ac258105a 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 1ebd11854622..01ad81dcacc5 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -84,7 +84,7 @@ static inline struct minix_sb_info *minix_sb(struct super_block *sb) static inline struct minix_inode_info *minix_i(struct inode *inode) { - return list_entry(inode, struct minix_inode_info, vfs_inode); + return container_of(inode, struct minix_inode_info, vfs_inode); } static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) diff --git a/fs/mount.h b/fs/mount.h index b5b8082bfa42..14db05d424f7 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -118,7 +118,6 @@ static inline void unlock_mount_hash(void) } struct proc_mounts { - struct seq_file m; struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); @@ -127,8 +126,6 @@ struct proc_mounts { loff_t cached_index; }; -#define proc_mounts(p) (container_of((p), struct proc_mounts, m)) - extern const struct seq_operations mounts_op; extern bool __is_local_mountpoint(struct dentry *dentry); diff --git a/fs/mpage.c b/fs/mpage.c index ca0244b69de8..778a4ddef77a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -42,14 +42,14 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io(struct bio *bio, int err) +static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; int i; bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, bio_data_dir(bio), err); + page_endio(page, bio_data_dir(bio), bio->bi_error); } bio_put(bio); @@ -277,7 +277,7 @@ alloc_new: goto out; } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - min_t(int, nr_pages, bio_get_nr_vecs(bdev)), + min_t(int, nr_pages, BIO_MAX_PAGES), GFP_KERNEL); if (bio == NULL) goto confused; @@ -602,7 +602,7 @@ alloc_new: } } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); + BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; diff --git a/fs/namei.c b/fs/namei.c index 2dad0eaf91d3..726d211db484 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -560,6 +560,24 @@ static int __nd_alloc_stack(struct nameidata *nd) return 0; } +/** + * path_connected - Verify that a path->dentry is below path->mnt.mnt_root + * @path: nameidate to verify + * + * Rename can sometimes move a file or directory outside of a bind + * mount, path_connected allows those cases to be detected. + */ +static bool path_connected(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + + /* Only bind mounts can have disconnected paths */ + if (mnt->mnt_root == mnt->mnt_sb->s_root) + return true; + + return is_subdir(path->dentry, mnt->mnt_root); +} + static inline int nd_alloc_stack(struct nameidata *nd) { if (likely(nd->depth != EMBEDDED_LEVELS)) @@ -792,7 +810,7 @@ static void set_root(struct nameidata *nd) get_fs_root(current->fs, &nd->root); } -static unsigned set_root_rcu(struct nameidata *nd) +static void set_root_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; unsigned seq; @@ -802,7 +820,6 @@ static unsigned set_root_rcu(struct nameidata *nd) nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); - return nd->root_seq; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -880,7 +897,7 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->path.dentry->d_inode; + parent = nd->inode; if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; @@ -1297,6 +1314,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; nd->path.dentry = parent; nd->seq = seq; + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } else { struct mount *mnt = real_mount(nd->path.mnt); @@ -1397,7 +1416,7 @@ static void follow_mount(struct path *path) } } -static void follow_dotdot(struct nameidata *nd) +static int follow_dotdot(struct nameidata *nd) { if (!nd->root.mnt) set_root(nd); @@ -1413,6 +1432,8 @@ static void follow_dotdot(struct nameidata *nd) /* rare case of legitimate dget_parent()... */ nd->path.dentry = dget_parent(nd->path.dentry); dput(old); + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } if (!follow_up(&nd->path)) @@ -1420,6 +1441,7 @@ static void follow_dotdot(struct nameidata *nd) } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; + return 0; } /* @@ -1635,7 +1657,7 @@ static inline int handle_dots(struct nameidata *nd, int type) if (nd->flags & LOOKUP_RCU) { return follow_dotdot_rcu(nd); } else - follow_dotdot(nd); + return follow_dotdot(nd); } return 0; } @@ -1955,8 +1977,13 @@ OK: continue; } } - if (unlikely(!d_can_lookup(nd->path.dentry))) + if (unlikely(!d_can_lookup(nd->path.dentry))) { + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + } return -ENOTDIR; + } } } @@ -1998,7 +2025,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (*s == '/') { if (flags & LOOKUP_RCU) { rcu_read_lock(); - nd->seq = set_root_rcu(nd); + set_root_rcu(nd); + nd->seq = nd->root_seq; } else { set_root(nd); path_get(&nd->root); @@ -2410,7 +2438,7 @@ done: /** * path_mountpoint - look up a path to be umounted - * @nameidata: lookup context + * @nd: lookup context * @flags: lookup flags * @path: pointer to container for result * diff --git a/fs/namespace.c b/fs/namespace.c index 9c1c43d0d4f1..0570729c87fd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1226,7 +1226,7 @@ EXPORT_SYMBOL(replace_mount_options); /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; down_read(&namespace_sem); if (p->cached_event == p->ns->event) { @@ -1247,7 +1247,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; p->cached_mount = seq_list_next(v, &p->ns->list, pos); p->cached_index = *pos; @@ -1261,7 +1261,7 @@ static void m_stop(struct seq_file *m, void *v) static int m_show(struct seq_file *m, void *v) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = list_entry(v, struct mount, mnt_list); return p->show(m, &r->mnt); } @@ -1361,6 +1361,36 @@ enum umount_tree_flags { UMOUNT_PROPAGATE = 2, UMOUNT_CONNECTED = 4, }; + +static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) +{ + /* Leaving mounts connected is only valid for lazy umounts */ + if (how & UMOUNT_SYNC) + return true; + + /* A mount without a parent has nothing to be connected to */ + if (!mnt_has_parent(mnt)) + return true; + + /* Because the reference counting rules change when mounts are + * unmounted and connected, umounted mounts may not be + * connected to mounted mounts. + */ + if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) + return true; + + /* Has it been requested that the mount remain connected? */ + if (how & UMOUNT_CONNECTED) + return false; + + /* Is the mount locked such that it needs to remain connected? */ + if (IS_MNT_LOCKED(mnt)) + return false; + + /* By default disconnect the mount */ + return true; +} + /* * mount_lock must be held * namespace_sem must be held for write @@ -1398,10 +1428,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; - disconnect = !(((how & UMOUNT_CONNECTED) && - mnt_has_parent(p) && - (p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) || - IS_MNT_LOCKED_AND_LAZY(p)); + disconnect = disconnect_mount(p, how); pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, disconnect ? &unmounted : NULL); @@ -1538,11 +1565,8 @@ void __detach_mounts(struct dentry *dentry) while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { - struct mount *p, *tmp; - list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { - hlist_add_head(&p->mnt_umount.s_list, &unmounted); - umount_mnt(p); - } + hlist_add_head(&mnt->mnt_umount.s_list, &unmounted); + umount_mnt(mnt); } else umount_tree(mnt, UMOUNT_CONNECTED); } @@ -2343,6 +2367,8 @@ unlock: return err; } +static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags); + /* * create a new mount for userspace and request it to be added into the * namespace's tree @@ -2374,6 +2400,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, flags |= MS_NODEV; mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } + if (type->fs_flags & FS_USERNS_VISIBLE) { + if (!fs_fully_visible(type, &mnt_flags)) + return -EPERM; + } } mnt = vfs_kern_mount(type, flags, name, data); @@ -3175,9 +3205,10 @@ bool current_chrooted(void) return chrooted; } -bool fs_fully_visible(struct file_system_type *type) +static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; + int new_flags = *new_mnt_flags; struct mount *mnt; bool visible = false; @@ -3187,6 +3218,8 @@ bool fs_fully_visible(struct file_system_type *type) down_read(&namespace_sem); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; + int mnt_flags; + if (mnt->mnt.mnt_sb->s_type != type) continue; @@ -3196,16 +3229,51 @@ bool fs_fully_visible(struct file_system_type *type) if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; - /* This mount is not fully visible if there are any child mounts - * that cover anything except for empty directories. + /* Read the mount flags and filter out flags that + * may safely be ignored. + */ + mnt_flags = mnt->mnt.mnt_flags; + if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) + mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); + + /* Verify the mount flags are equal to or more permissive + * than the proposed new mount. + */ + if ((mnt_flags & MNT_LOCK_READONLY) && + !(new_flags & MNT_READONLY)) + continue; + if ((mnt_flags & MNT_LOCK_NODEV) && + !(new_flags & MNT_NODEV)) + continue; + if ((mnt_flags & MNT_LOCK_NOSUID) && + !(new_flags & MNT_NOSUID)) + continue; + if ((mnt_flags & MNT_LOCK_NOEXEC) && + !(new_flags & MNT_NOEXEC)) + continue; + if ((mnt_flags & MNT_LOCK_ATIME) && + ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) + continue; + + /* This mount is not fully visible if there are any + * locked child mounts that cover anything except for + * empty directories. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; - if (!S_ISDIR(inode->i_mode)) - goto next; - if (inode->i_nlink > 2) + /* Only worry about locked mounts */ + if (!(mnt_flags & MNT_LOCKED)) + continue; + /* Is the directory permanetly empty? */ + if (!is_empty_dir_inode(inode)) goto next; } + /* Preserve the locked attributes */ + *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ + MNT_LOCK_NODEV | \ + MNT_LOCK_NOSUID | \ + MNT_LOCK_NOEXEC | \ + MNT_LOCK_ATIME); visible = true; goto found; next: ; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 80021c709af9..93575e91a7aa 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1145,6 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, case 0x00: ncp_dbg(1, "renamed %pd -> %pd\n", old_dentry, new_dentry); + ncp_d_prune(old_dentry); + ncp_d_prune(new_dentry); break; case 0x9E: error = -ENAMETOOLONG; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d2554fe140a3..9cd4eb3a1e22 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -116,7 +116,7 @@ bl_submit_bio(int rw, struct bio *bio) static struct bio * bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, - void (*end_io)(struct bio *, int err), struct parallel_io *par) + bio_end_io_t end_io, struct parallel_io *par) { struct bio *bio; @@ -139,8 +139,7 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, static struct bio * do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), + struct pnfs_block_extent *be, bio_end_io_t end_io, struct parallel_io *par, unsigned int offset, int *len) { struct pnfs_block_dev *dev = @@ -183,11 +182,11 @@ retry: return bio; } -static void bl_end_io_read(struct bio *bio, int err) +static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; - if (err) { + if (bio->bi_error) { struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) @@ -316,13 +315,12 @@ out: return PNFS_ATTEMPTED; } -static void bl_end_io_write(struct bio *bio, int err) +static void bl_end_io_write(struct bio *bio) { struct parallel_io *par = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nfs_pgio_header *header = par->data; - if (!uptodate) { + if (bio->bi_error) { if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 92dca9e90d8d..c556640dcf3b 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -46,13 +46,6 @@ struct pnfs_block_dev; -enum pnfs_block_volume_type { - PNFS_BLOCK_VOLUME_SIMPLE = 0, - PNFS_BLOCK_VOLUME_SLICE = 1, - PNFS_BLOCK_VOLUME_CONCAT = 2, - PNFS_BLOCK_VOLUME_STRIPE = 3, -}; - #define PNFS_BLOCK_MAX_UUIDS 4 #define PNFS_BLOCK_MAX_DEVICES 64 @@ -117,13 +110,6 @@ struct pnfs_block_dev { struct pnfs_block_dev_map *map); }; -enum exstate4 { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ - PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ -}; - /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { union { @@ -134,15 +120,12 @@ struct pnfs_block_extent { sector_t be_f_offset; /* the starting offset in the file */ sector_t be_length; /* the size of the extent */ sector_t be_v_offset; /* the starting offset in the volume */ - enum exstate4 be_state; /* the state of this extent */ + enum pnfs_block_extent_state be_state; /* the state of this extent */ #define EXTENT_WRITTEN 1 #define EXTENT_COMMITTING 2 unsigned int be_tag; }; -/* on the wire size of the extent */ -#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) - struct pnfs_block_layout { struct pnfs_layout_hdr bl_layout; struct rb_root bl_ext_rw; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index e535599a0719..a861bbdfe577 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev) kfree(dev->children); } else { if (dev->bdev) - blkdev_put(dev->bdev, FMODE_READ); + blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); } } @@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) return -EIO; p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); b->simple.sigs[i].sig_len = be32_to_cpup(p++); + if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) { + pr_info("signature too long: %d\n", + b->simple.sigs[i].sig_len); + return -EIO; + } p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); if (!p) @@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, if (!dev) return -EIO; - d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); if (IS_ERR(d->bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 31d0b5e53dfd..c59a59c37f3d 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -462,6 +462,12 @@ out: return err; } +static size_t ext_tree_layoutupdate_size(size_t count) +{ + return sizeof(__be32) /* number of entries */ + + PNFS_BLOCK_EXTENT_SIZE * count; +} + static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, size_t buffer_size) { @@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, continue; (*count)++; - if (*count * BL_EXTENT_SIZE > buffer_size) { + if (ext_tree_layoutupdate_size(*count) > buffer_size) { /* keep counting.. */ ret = -ENOSPC; continue; @@ -530,7 +536,7 @@ retry: if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); - buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + buffer_size = ext_tree_layoutupdate_size(count); count = 0; arg->layoutupdate_pages = @@ -549,17 +555,14 @@ retry: } *start_p = cpu_to_be32(count); - arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + arg->layoutupdate_len = ext_tree_layoutupdate_size(count); if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { - __be32 *p = start_p; + void *p = start_p, *end = p + arg->layoutupdate_len; int i = 0; - for (p = start_p; - p < start_p + arg->layoutupdate_len; - p += PAGE_SIZE) { + for ( ; p < end; p += PAGE_SIZE) arg->layoutupdate_pages[i++] = vmalloc_to_page(p); - } } dprintk("%s found %zu ranges\n", __func__, count); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 8d129bb7355a..75f7c0a7538a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv) spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(rqstp)) { - svc_xprt_put(serv->sv_bc_xprt); - serv->sv_bc_xprt = NULL; - } dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); return rqstp; } @@ -308,6 +304,10 @@ err_bind: return ret; } +static struct svc_serv_ops nfs_cb_sv_ops = { + .svo_enqueue_xprt = svc_xprt_do_enqueue, +}; + static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; @@ -333,7 +333,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); @@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * pg_authenticate method for nfsv4 callback threads. * * The authflavor has been negotiated, so an incorrect flavor is a server - * bug. Drop packets with incorrect authflavor. + * bug. Deny packets with incorrect authflavor. * * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound @@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) - return SVC_DROP; + return SVC_DENIED; break; case RPC_AUTH_GSS: /* No RPC_AUTH_GSS support yet in NFSv4.1 */ if (svc_is_backchannel(rqstp)) - return SVC_DROP; + return SVC_DENIED; } return SVC_OK; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 197806fb87ff..b85cf7a30232 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL, + -ntohl(res->status)); goto out; + } nfsi = NFS_I(inode); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); @@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, res->status = 0; out_iput: rcu_read_unlock(); + trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); @@ -194,6 +198,7 @@ unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); + trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); iput(ino); out: return rv; @@ -327,10 +332,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr); /* Normal */ - if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { - slot->seq_nr++; + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) goto out_ok; - } /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { @@ -418,6 +421,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_process_state *cps) { struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; struct nfs_client *clp; int i; __be32 status = htonl(NFS4ERR_BADSESSION); @@ -429,25 +433,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) goto out; + tbl = &clp->cl_session->bc_slot_table; + slot = tbl->slots + args->csa_slotid; spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { - spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); /* Return NFS4ERR_BADSESSION if we're draining the session * in order to reset it. */ if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) status = htonl(NFS4ERR_BADSESSION); - goto out; + goto out_unlock; } - status = validate_seqid(&clp->cl_session->bc_slot_table, args); - spin_unlock(&tbl->slot_tbl_lock); + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + + status = validate_seqid(tbl, args); if (status) - goto out; + goto out_unlock; cps->slotid = args->csa_slotid; @@ -458,15 +469,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, */ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { status = htonl(NFS4ERR_DELAY); - goto out; + goto out_unlock; } - memcpy(&res->csr_sessionid, &args->csa_sessionid, - sizeof(res->csr_sessionid)); - res->csr_sequenceid = args->csa_sequenceid; - res->csr_slotid = args->csa_slotid; - res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; - res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + /* + * RFC5661 20.9.3 + * If CB_SEQUENCE returns an error, then the state of the slot + * (sequence ID, cached reply) MUST NOT change. + */ + slot->seq_nr++; +out_unlock: + spin_unlock(&tbl->slot_tbl_lock); out: cps->clp = clp; /* put in nfs4_callback_compound */ @@ -546,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, status = htonl(NFS4_OK); nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); - nfs41_server_notify_target_slotid_update(cps->clp); + nfs41_notify_server(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 19ca95cdfd9b..6b1697a01dde 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r xdr_init_encode(&xdr_out, &rqstp->rq_res, p); status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); - if (status == __constant_htonl(NFS4ERR_RESOURCE)) + if (status == htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; if (hdr_arg.minorversion == 0) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 892aefff3630..57c5a02f6213 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -20,6 +20,7 @@ #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> @@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_put_client); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/* - * Test if two ip6 socket addresses refer to the same socket by - * comparing relevant fields. The padding bytes specifically, are not - * compared. sin6_flowinfo is not compared because it only affects QoS - * and sin6_scope_id is only compared if the address is "link local" - * because "link local" addresses need only be unique to a specific - * link. Conversely, ordinary unicast addresses might have different - * sin6_scope_id. - * - * The caller should ensure both socket addresses are AF_INET6. - */ -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - - if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) - return 0; - else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) - return sin1->sin6_scope_id == sin2->sin6_scope_id; - - return 1; -} -#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - return 0; -} -#endif - -/* - * Test if two ip4 socket addresses refer to the same socket, by - * comparing relevant fields. The padding bytes specifically, are - * not compared. - * - * The caller should ensure both socket addresses are AF_INET. - */ -static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - - return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; -} - -static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - - return nfs_sockaddr_match_ipaddr6(sa1, sa2) && - (sin1->sin6_port == sin2->sin6_port); -} - -static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - - return nfs_sockaddr_match_ipaddr4(sa1, sa2) && - (sin1->sin_port == sin2->sin_port); -} - -#if defined(CONFIG_NFS_V4_1) -/* - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, excluding the port number. - */ -int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) - return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_match_ipaddr4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_match_ipaddr6(sa1, sa2); - } - return 0; -} -EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr); -#endif /* CONFIG_NFS_V4_1 */ - -/* - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, including the port number. - */ -static int nfs_sockaddr_cmp(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) - return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_cmp_ip4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_cmp_ip6(sa1, sa2); - } - return 0; -} - /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. @@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat if (clp->cl_minorversion != data->minorversion) continue; /* Match the full socket address */ - if (!nfs_sockaddr_cmp(sap, clap)) + if (!rpc_cmp_addr_port(sap, clap)) continue; atomic_inc(&clp->cl_count); @@ -775,7 +666,7 @@ static int nfs_init_server(struct nfs_server *server, server->options = data->options; server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -825,7 +716,6 @@ error: * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, - struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) { unsigned long max_rpc_payload; @@ -901,7 +791,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs if (error < 0) goto out_error; - nfs_server_set_fsinfo(server, mntfh, &fsinfo); + nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { @@ -1193,8 +1083,6 @@ void nfs_clients_init(struct net *net) } #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_fs_nfs; - static int nfs_server_list_open(struct inode *inode, struct file *file); static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); @@ -1364,27 +1252,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; - char dev[8], fsid[17]; + char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' + char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { - seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); + seq_puts(m, "NV SERVER PORT DEV FSID" + " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, master_link); clp = server->nfs_client; - snprintf(dev, 8, "%u:%u", + snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); - snprintf(fsid, 17, "%llx:%llx", + snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); - seq_printf(m, "v%u %s %s %-7s %-17s %s\n", + seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), @@ -1434,27 +1324,20 @@ void nfs_fs_proc_net_exit(struct net *net) */ int __init nfs_fs_proc_init(void) { - struct proc_dir_entry *p; - - proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); - if (!proc_fs_nfs) + if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ - p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers"); - if (!p) + if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ - p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes"); - if (!p) - goto error_2; - return 0; + if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) + goto error_1; -error_2: - remove_proc_entry("servers", proc_fs_nfs); + return 0; error_1: - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } @@ -1464,9 +1347,7 @@ error_0: */ void nfs_fs_proc_exit(void) { - remove_proc_entry("volumes", proc_fs_nfs); - remove_proc_entry("servers", proc_fs_nfs); - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("fs/nfsfs", NULL); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 029d688a969f..2714ef835bdd 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, if (delegation->inode != NULL) { nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; + delegation->pagemod_limit = res->pagemod_limit; oldcred = delegation->cred; delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, @@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct return -ENOMEM; nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; + delegation->pagemod_limit = res->pagemod_limit; delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; @@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, rcu_read_unlock(); return ret; } + +/** + * nfs4_delegation_flush_on_close - Check if we must flush file on close + * @inode: inode to check + * + * This function checks the number of outstanding writes to the file + * against the delegation 'space_limit' field to see if + * the spec requires us to flush the file on close. + */ +bool nfs4_delegation_flush_on_close(const struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + bool ret = true; + + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || !(delegation->type & FMODE_WRITE)) + goto out; + if (nfsi->nrequests < delegation->pagemod_limit) + ret = false; +out: + rcu_read_unlock(); + return ret; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index e3c20a3ccc93..a44829173e57 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -18,7 +18,7 @@ struct nfs_delegation { struct inode *inode; nfs4_stateid stateid; fmode_t type; - loff_t maxsize; + unsigned long pagemod_limit; __u64 change_attr; unsigned long flags; spinlock_t lock; @@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); +bool nfs4_delegation_flush_on_close(const struct inode *inode); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b2c8b31b2be7..3d8e4ffa0a33 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -583,26 +583,19 @@ out_nopages: } static -void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +void nfs_readdir_free_pages(struct page **pages, unsigned int npages) { unsigned int i; for (i = 0; i < npages; i++) put_page(pages[i]); } -static -void nfs_readdir_free_large_page(void *ptr, struct page **pages, - unsigned int npages) -{ - nfs_readdir_free_pagearray(pages, npages); -} - /* * nfs_readdir_large_page will allocate pages that must be freed with a call - * to nfs_readdir_free_large_page + * to nfs_readdir_free_pagearray */ static -int nfs_readdir_large_page(struct page **pages, unsigned int npages) +int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) { unsigned int i; @@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages) return 0; out_freepages: - nfs_readdir_free_pagearray(pages, i); + nfs_readdir_free_pages(pages, i); return -ENOMEM; } @@ -623,7 +616,6 @@ static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { struct page *pages[NFS_MAX_READDIR_PAGES]; - void *pages_ptr = NULL; struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; @@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; - status = nfs_readdir_large_page(pages, array_size); + status = nfs_readdir_alloc_pages(pages, array_size); if (status < 0) goto out_release_array; do { @@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, } } while (array->eof_index < 0); - nfs_readdir_free_large_page(pages_ptr, pages, array_size); + nfs_readdir_free_pages(pages, array_size); out_release_array: nfs_readdir_release_array(page); out_label_free: @@ -1470,9 +1462,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx, { int err; - if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) - *opened |= FILE_CREATED; - err = finish_open(file, dentry, do_open, opened); if (err) goto out; @@ -1771,7 +1760,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { - if (d_really_is_positive(dentry) && !d_unhashed(dentry)) + if (simple_positive(dentry)) d_delete(dentry); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8b8d83a526ce..c0f9b1ed12b9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp) dprintk("NFS: release(%pD2)\n", filp); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); - return nfs_release(inode, filp); + nfs_file_clear_open_context(filp); + return 0; } EXPORT_SYMBOL_GPL(nfs_file_release); @@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek); /* * Flush all dirty pages, and check for write errors. */ -int +static int nfs_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); @@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) if ((file->f_mode & FMODE_WRITE) == 0) return 0; - /* - * If we're holding a write delegation, then just start the i/o - * but don't wait for completion (or send a commit). - */ - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - return filemap_fdatawrite(file->f_mapping); - /* Flush writes to the server and return any errors */ return vfs_fsync(file, 0); } -EXPORT_SYMBOL_GPL(nfs_file_flush); ssize_t nfs_file_read(struct kiocb *iocb, struct iov_iter *to) @@ -555,31 +548,22 @@ static int nfs_launder_page(struct page *page) return nfs_wb_page(inode, page); } -#ifdef CONFIG_NFS_SWAP static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { - int ret; struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); *span = sis->pages; - rcu_read_lock(); - ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1); - rcu_read_unlock(); - - return ret; + return rpc_clnt_swap_activate(clnt); } static void nfs_swap_deactivate(struct file *file) { struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); - rcu_read_lock(); - xs_swapper(rcu_dereference(clnt->cl_xprt), 0); - rcu_read_unlock(); + rpc_clnt_swap_deactivate(clnt); } -#endif const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, @@ -596,10 +580,8 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, -#ifdef CONFIG_NFS_SWAP .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, -#endif }; /* @@ -655,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_sync_write(struct file *filp, struct inode *inode) +static int nfs_need_check_write(struct file *filp, struct inode *inode) { struct nfs_open_context *ctx; - if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) - return 1; ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || nfs_ctx_key_to_expire(ctx)) @@ -710,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result > 0) written = result; - /* Return error values for O_DSYNC and IS_SYNC() */ - if (result >= 0 && nfs_need_sync_write(file, inode)) { + /* Return error values */ + if (result >= 0 && nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7d05089e52d6..fbc5a56de875 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -20,6 +20,7 @@ #include "../nfs4trace.h" #include "../iostat.h" #include "../nfs.h" +#include "../nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -33,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) ffl = kzalloc(sizeof(*ffl), gfp_flags); if (ffl) { INIT_LIST_HEAD(&ffl->error_list); + INIT_LIST_HEAD(&ffl->mirrors); return &ffl->generic_hdr; } else return NULL; @@ -134,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id) return 0; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + int i, j; + + if (m1->fh_versions_cnt != m2->fh_versions_cnt) + return false; + for (i = 0; i < m1->fh_versions_cnt; i++) { + bool found_fh = false; + for (j = 0; j < m2->fh_versions_cnt; i++) { + if (nfs_compare_fh(&m1->fh_versions[i], + &m2->fh_versions[j]) == 0) { + found_fh = true; + break; + } + } + if (!found_fh) + return false; + } + return true; +} + +static struct nfs4_ff_layout_mirror * +ff_layout_add_mirror(struct pnfs_layout_hdr *lo, + struct nfs4_ff_layout_mirror *mirror) +{ + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); + struct nfs4_ff_layout_mirror *pos; + struct inode *inode = lo->plh_inode; + + spin_lock(&inode->i_lock); + list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { + if (mirror->mirror_ds != pos->mirror_ds) + continue; + if (!ff_mirror_match_fh(mirror, pos)) + continue; + if (atomic_inc_not_zero(&pos->ref)) { + spin_unlock(&inode->i_lock); + return pos; + } + } + list_add(&mirror->mirrors, &ff_layout->mirrors); + mirror->layout = lo; + spin_unlock(&inode->i_lock); + return mirror; +} + +static void +ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + struct inode *inode; + if (mirror->layout == NULL) + return; + inode = mirror->layout->plh_inode; + spin_lock(&inode->i_lock); + list_del(&mirror->mirrors); + spin_unlock(&inode->i_lock); + mirror->layout = NULL; +} + +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +{ + struct nfs4_ff_layout_mirror *mirror; + + mirror = kzalloc(sizeof(*mirror), gfp_flags); + if (mirror != NULL) { + spin_lock_init(&mirror->lock); + atomic_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + } + return mirror; +} + +static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + ff_layout_remove_mirror(mirror); + kfree(mirror->fh_versions); + if (mirror->cred) + put_rpccred(mirror->cred); + nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + kfree(mirror); +} + +static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror != NULL && atomic_dec_and_test(&mirror->ref)) + ff_layout_free_mirror(mirror); +} + static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) { int i; @@ -143,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) /* normally mirror_ds is freed in * .free_deviceid_node but we still do it here * for .alloc_lseg error path */ - if (fls->mirror_array[i]) { - kfree(fls->mirror_array[i]->fh_versions); - nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); - kfree(fls->mirror_array[i]); - } + ff_layout_put_mirror(fls->mirror_array[i]); } kfree(fls->mirror_array); fls->mirror_array = NULL; @@ -180,19 +267,75 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) } } +static bool +ff_lseg_range_is_after(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 end1, end2; + + if (l1->iomode != l2->iomode) + return l1->iomode != IOMODE_READ; + end1 = pnfs_calc_offset_end(l1->offset, l1->length); + end2 = pnfs_calc_offset_end(l2->offset, l2->length); + if (end1 < l2->offset) + return false; + if (end2 < l1->offset) + return true; + return l2->offset <= l1->offset; +} + +static bool +ff_lseg_merge(struct pnfs_layout_segment *new, + struct pnfs_layout_segment *old) +{ + u64 new_end, old_end; + + if (new->pls_range.iomode != old->pls_range.iomode) + return false; + old_end = pnfs_calc_offset_end(old->pls_range.offset, + old->pls_range.length); + if (old_end < new->pls_range.offset) + return false; + new_end = pnfs_calc_offset_end(new->pls_range.offset, + new->pls_range.length); + if (new_end < old->pls_range.offset) + return false; + + /* Mergeable: copy info from 'old' to 'new' */ + if (new_end < old_end) + new_end = old_end; + if (new->pls_range.offset < old->pls_range.offset) + new->pls_range.offset = old->pls_range.offset; + new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset, + new_end); + if (test_bit(NFS_LSEG_ROC, &old->pls_flags)) + set_bit(NFS_LSEG_ROC, &new->pls_flags); + if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) + set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags); + return true; +} + +static void +ff_layout_add_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + pnfs_generic_layout_insert_lseg(lo, lseg, + ff_lseg_range_is_after, + ff_lseg_merge, + free_me); +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { - struct nfs4_ff_layout_mirror *tmp; int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) { - tmp = fls->mirror_array[i]; - fls->mirror_array[i] = fls->mirror_array[j]; - fls->mirror_array[j] = tmp; - } + fls->mirror_array[j]->efficiency) + swap(fls->mirror_array[i], + fls->mirror_array[j]); } } @@ -248,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; for (i = 0; i < fls->mirror_array_cnt; i++) { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; struct nfs4_deviceid_node *idnode; u32 ds_count; @@ -264,15 +408,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (ds_count != 1) goto out_err_free; - fls->mirror_array[i] = - kzalloc(sizeof(struct nfs4_ff_layout_mirror), - gfp_flags); + fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - spin_lock_init(&fls->mirror_array[i]->lock); fls->mirror_array[i]->ds_count = ds_count; /* deviceid */ @@ -339,11 +480,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (rc) goto out_err_free; + mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); + if (mirror != fls->mirror_array[i]) { + ff_layout_free_mirror(fls->mirror_array[i]); + fls->mirror_array[i] = mirror; + } + dprintk("%s: uid %d gid %d\n", __func__, fls->mirror_array[i]->uid, fls->mirror_array[i]->gid); } + p = xdr_inline_decode(&stream, 4); + if (p) + fls->flags = be32_to_cpup(p); + ff_layout_sort_mirrors(fls); rc = ff_layout_check_layout(lgr); if (rc) @@ -376,21 +527,9 @@ static void ff_layout_free_lseg(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); - int i; dprintk("--> %s\n", __func__); - for (i = 0; i < fls->mirror_array_cnt; i++) { - if (fls->mirror_array[i]) { - nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); - fls->mirror_array[i]->mirror_ds = NULL; - if (fls->mirror_array[i]->cred) { - put_rpccred(fls->mirror_array[i]->cred); - fls->mirror_array[i]->cred = NULL; - } - } - } - if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_flexfile_layout *ffl; struct inode *inode; @@ -415,6 +554,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) return 1; } +static void +nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) +{ + /* first IO request? */ + if (atomic_inc_return(&timer->n_ops) == 1) { + timer->start_time = now; + } +} + +static ktime_t +nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) +{ + ktime_t start; + + if (atomic_dec_return(&timer->n_ops) < 0) + WARN_ON_ONCE(1); + + start = timer->start_time; + timer->start_time = now; + return ktime_sub(now, start); +} + +static bool +nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + struct nfs4_ff_layoutstat *layoutstat, + ktime_t now) +{ + static const ktime_t notime = {0}; + s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; + + nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); + if (ktime_equal(mirror->start_time, notime)) + mirror->start_time = now; + if (ktime_equal(mirror->last_report_time, notime)) + mirror->last_report_time = now; + if (layoutstats_timer != 0) + report_interval = (s64)layoutstats_timer * 1000LL; + if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= + report_interval) { + mirror->last_report_time = now; + return true; + } + + return false; +} + +static void +nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat, + __u64 requested) +{ + struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + + iostat->ops_requested++; + iostat->bytes_requested += requested; +} + +static void +nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, + __u64 requested, + __u64 completed, + ktime_t time_completed, + ktime_t time_started) +{ + struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + ktime_t completion_time = ktime_sub(time_completed, time_started); + ktime_t timer; + + iostat->ops_completed++; + iostat->bytes_completed += completed; + iostat->bytes_not_delivered += requested - completed; + + timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed); + iostat->total_busy_time = + ktime_add(iostat->total_busy_time, timer); + iostat->aggregate_completion_time = + ktime_add(iostat->aggregate_completion_time, + completion_time); +} + +static void +nfs4_ff_layout_stat_io_start_read(struct inode *inode, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, ktime_t now) +{ + bool report; + + spin_lock(&mirror->lock); + report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); + nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + spin_unlock(&mirror->lock); + + if (report) + pnfs_report_layoutstat(inode, GFP_KERNEL); +} + +static void +nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, + __u64 completed) +{ + spin_lock(&mirror->lock); + nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + requested, completed, + ktime_get(), task->tk_start); + spin_unlock(&mirror->lock); +} + +static void +nfs4_ff_layout_stat_io_start_write(struct inode *inode, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, ktime_t now) +{ + bool report; + + spin_lock(&mirror->lock); + report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); + nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + spin_unlock(&mirror->lock); + + if (report) + pnfs_report_layoutstat(inode, GFP_NOIO); +} + +static void +nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, + __u64 completed, + enum nfs3_stable_how committed) +{ + if (committed == NFS_UNSTABLE) + requested = completed = 0; + + spin_lock(&mirror->lock); + nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + requested, completed, ktime_get(), task->tk_start); + spin_unlock(&mirror->lock); +} + static int ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, @@ -585,8 +864,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); return 1; } @@ -631,7 +908,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) nfs_direct_set_resched_writes(hdr->dreq); /* fake unstable write to let common nfs resend pages */ hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = 0; + hdr->good_bytes = hdr->args.count; } return; } @@ -788,18 +1065,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, if (task->tk_status >= 0) return 0; - if (task->tk_status != -EJUKEBOX) { + switch (task->tk_status) { + /* File access problems. Don't mark the device as unavailable */ + case -EACCES: + case -ESTALE: + case -EISDIR: + case -EBADHANDLE: + case -ELOOP: + case -ENOSPC: + break; + case -EJUKEBOX: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + goto out_retry; + default: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_mark_deviceid_unavailable(devid); - if (ff_layout_has_available_ds(lseg)) - return -NFS4ERR_RESET_TO_PNFS; - else - return -NFS4ERR_RESET_TO_MDS; } - - if (task->tk_status == -EJUKEBOX) - nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + /* FIXME: Need to prevent infinite looping here. */ + return -NFS4ERR_RESET_TO_PNFS; +out_retry: task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); @@ -829,15 +1114,41 @@ static int ff_layout_async_handle_error(struct rpc_task *task, static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, int idx, u64 offset, u64 length, - u32 status, int opnum) + u32 status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; int err; + if (status == 0) { + switch (error) { + case -ETIMEDOUT: + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + case -EOPNOTSUPP: + case -ECONNREFUSED: + case -ECONNRESET: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EADDRINUSE: + case -ENOBUFS: + case -EPIPE: + case -EPERM: + status = NFS4ERR_NXIO; + break; + case -EACCES: + status = NFS4ERR_ACCESS; + break; + default: + return; + } + } + mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, GFP_NOIO); + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -846,16 +1157,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode; int err; trace_nfs4_pnfs_read(hdr, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) - hdr->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && hdr->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, hdr->args.offset, hdr->args.count, - hdr->res.op_status, OP_READ); + hdr->res.op_status, OP_READ, + task->tk_status); err = ff_layout_async_handle_error(task, hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); @@ -867,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - inode = hdr->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, hdr->lseg); ff_layout_reset_read(hdr); return task->tk_status; case -EAGAIN: @@ -879,6 +1186,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task, return 0; } +static bool +ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) +{ + return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT); +} + /* * We reference the rpc_cred of the first WRITE that triggers the need for * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. @@ -891,6 +1204,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task, static void ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) { + if (!ff_layout_need_layoutcommit(hdr->lseg)) + return; + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, @@ -909,6 +1225,11 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) static int ff_layout_read_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { + nfs4_ff_layout_stat_io_start_read(hdr->inode, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, + task->tk_start); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -962,15 +1283,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_read_prepare_common(task, hdr)) - return; - if (ff_layout_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) return; + if (ff_layout_read_prepare_common(task, hdr)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, FMODE_READ) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ @@ -982,6 +1303,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + nfs4_ff_layout_stat_io_end_read(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1003,32 +1328,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode; int err; trace_nfs4_pnfs_write(hdr, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) - hdr->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && hdr->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, hdr->args.offset, hdr->args.count, - hdr->res.op_status, OP_WRITE); + hdr->res.op_status, OP_WRITE, + task->tk_status); err = ff_layout_async_handle_error(task, hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); switch (err) { case -NFS4ERR_RESET_TO_PNFS: + pnfs_set_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, true); + return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - inode = hdr->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, hdr->lseg); - if (err == -NFS4ERR_RESET_TO_PNFS) { - pnfs_set_retry_layoutget(hdr->lseg->pls_layout); - ff_layout_reset_write(hdr, true); - } else { - pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); - ff_layout_reset_write(hdr, false); - } + pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, false); return task->tk_status; case -EAGAIN: rpc_restart_call_prepare(task); @@ -1039,34 +1358,35 @@ static int ff_layout_write_done_cb(struct rpc_task *task, hdr->res.verf->committed == NFS_DATA_SYNC) ff_layout_set_layoutcommit(hdr); + /* zero out fattr since we don't care DS attr at all */ + hdr->fattr.valid = 0; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; } static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { - struct inode *inode; int err; trace_nfs4_pnfs_commit_ds(data, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !data->res.op_status) - data->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && data->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, data->args.offset, data->args.count, - data->res.op_status, OP_COMMIT); + data->res.op_status, OP_COMMIT, + task->tk_status); err = ff_layout_async_handle_error(task, NULL, data->ds_clp, data->lseg, data->ds_commit_index); switch (err) { case -NFS4ERR_RESET_TO_PNFS: + pnfs_set_retry_layoutget(data->lseg->pls_layout); + pnfs_generic_prepare_to_resend_writes(data); + return -EAGAIN; case -NFS4ERR_RESET_TO_MDS: - inode = data->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, data->lseg); - if (err == -NFS4ERR_RESET_TO_PNFS) - pnfs_set_retry_layoutget(data->lseg->pls_layout); - else - pnfs_clear_retry_layoutget(data->lseg->pls_layout); + pnfs_clear_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -EAGAIN: @@ -1074,7 +1394,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) + if (data->verf.committed == NFS_UNSTABLE + && ff_layout_need_layoutcommit(data->lseg)) pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; @@ -1083,6 +1404,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, static int ff_layout_write_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { + nfs4_ff_layout_stat_io_start_write(hdr->inode, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, + task->tk_start); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1116,15 +1442,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_write_prepare_common(task, hdr)) - return; - if (ff_layout_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) return; + if (ff_layout_write_prepare_common(task, hdr)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, FMODE_WRITE) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ @@ -1134,6 +1460,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count, + hdr->res.verf->committed); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1152,8 +1483,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); } +static void ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + nfs4_ff_layout_stat_io_start_write(cdata->inode, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + 0, task->tk_start); +} + static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { + ff_layout_commit_prepare_common(task, data); rpc_call_start(task); } @@ -1161,10 +1501,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - ff_layout_setup_sequence(wdata->ds_clp, + if (ff_layout_setup_sequence(wdata->ds_clp, &wdata->args.seq_args, &wdata->res.seq_res, - task); + task)) + return; + ff_layout_commit_prepare_common(task, data); +} + +static void ff_layout_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + struct nfs_page *req; + __u64 count = 0; + + if (task->tk_status == 0) { + list_for_each_entry(req, &cdata->pages, wb_list) + count += req->wb_bytes; + } + + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + count, count, NFS_FILE_SYNC); + + pnfs_generic_write_commit_done(task, data); } static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) @@ -1205,14 +1565,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { .rpc_call_prepare = ff_layout_commit_prepare_v3, - .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, .rpc_release = pnfs_generic_commit_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { .rpc_call_prepare = ff_layout_commit_prepare_v4, - .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, .rpc_release = pnfs_generic_commit_release, }; @@ -1256,7 +1616,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) hdr->args.fh = fh; - /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. @@ -1385,6 +1744,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, @@ -1488,6 +1848,243 @@ out: dprintk("%s: Return\n", __func__); } +static int +ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + + return snprintf(buf, buflen, "%pI4", &sin->sin_addr); +} + +static size_t +ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf, + const int buflen) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr *addr = &sin6->sin6_addr; + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded ANY address + */ + if (ipv6_addr_any(addr)) + return snprintf(buf, buflen, "::"); + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded loopback address + */ + if (ipv6_addr_loopback(addr)) + return snprintf(buf, buflen, "::1"); + + /* + * RFC 4291, Section 2.2.3 + * + * Special presentation address format for mapped v4 + * addresses. + */ + if (ipv6_addr_v4mapped(addr)) + return snprintf(buf, buflen, "::ffff:%pI4", + &addr->s6_addr32[3]); + + /* + * RFC 4291, Section 2.2.1 + */ + return snprintf(buf, buflen, "%pI6c", addr); +} + +/* Derived from rpc_sockaddr2uaddr */ +static void +ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) +{ + struct sockaddr *sap = (struct sockaddr *)&da->da_addr; + char portbuf[RPCBIND_MAXUADDRPLEN]; + char addrbuf[RPCBIND_MAXUADDRLEN]; + char *netid; + unsigned short port; + int len, netid_len; + __be32 *p; + + switch (sap->sa_family) { + case AF_INET: + if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) + return; + port = ntohs(((struct sockaddr_in *)sap)->sin_port); + netid = "tcp"; + netid_len = 3; + break; + case AF_INET6: + if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) + return; + port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); + netid = "tcp6"; + netid_len = 4; + break; + default: + /* we only support tcp and tcp6 */ + WARN_ON_ONCE(1); + return; + } + + snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff); + len = strlcat(addrbuf, portbuf, sizeof(addrbuf)); + + p = xdr_reserve_space(xdr, 4 + netid_len); + xdr_encode_opaque(p, netid, netid_len); + + p = xdr_reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, addrbuf, len); +} + +static void +ff_layout_encode_nfstime(struct xdr_stream *xdr, + ktime_t t) +{ + struct timespec64 ts; + __be32 *p; + + p = xdr_reserve_space(xdr, 12); + ts = ktime_to_timespec64(t); + p = xdr_encode_hyper(p, ts.tv_sec); + *p++ = cpu_to_be32(ts.tv_nsec); +} + +static void +ff_layout_encode_io_latency(struct xdr_stream *xdr, + struct nfs4_ff_io_stat *stat) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 5 * 8); + p = xdr_encode_hyper(p, stat->ops_requested); + p = xdr_encode_hyper(p, stat->bytes_requested); + p = xdr_encode_hyper(p, stat->ops_completed); + p = xdr_encode_hyper(p, stat->bytes_completed); + p = xdr_encode_hyper(p, stat->bytes_not_delivered); + ff_layout_encode_nfstime(xdr, stat->total_busy_time); + ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time); +} + +static void +ff_layout_encode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args, + struct nfs42_layoutstat_devinfo *devinfo) +{ + struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private; + struct nfs4_pnfs_ds_addr *da; + struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; + struct nfs_fh *fh = &mirror->fh_versions[0]; + __be32 *p, *start; + + da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); + dprintk("%s: DS %s: encoding address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + /* layoutupdate length */ + start = xdr_reserve_space(xdr, 4); + /* netaddr4 */ + ff_layout_encode_netaddr(xdr, da); + /* nfs_fh4 */ + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); + /* ff_io_latency4 read */ + spin_lock(&mirror->lock); + ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + /* ff_io_latency4 write */ + ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); + spin_unlock(&mirror->lock); + /* nfstime4 */ + ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + /* bool */ + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(false); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); +} + +static int +ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, + struct pnfs_layout_hdr *lo, + int dev_limit) +{ + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *dev; + struct nfs42_layoutstat_devinfo *devinfo; + int i = 0; + + list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { + if (i >= dev_limit) + break; + if (!mirror->mirror_ds) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!atomic_inc_not_zero(&mirror->ref)) + continue; + dev = &mirror->mirror_ds->id_node; + devinfo = &args->devinfo[i]; + memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + devinfo->read_count = mirror->read_stat.io_stat.ops_completed; + devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; + devinfo->write_count = mirror->write_stat.io_stat.ops_completed; + devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->layoutstats_encode = ff_layout_encode_layoutstats; + devinfo->layout_private = mirror; + + i++; + } + return i; +} + +static int +ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +{ + struct nfs4_flexfile_layout *ff_layout; + struct nfs4_ff_layout_mirror *mirror; + int dev_count = 0; + + spin_lock(&args->inode->i_lock); + ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); + list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { + if (atomic_read(&mirror->ref) != 0) + dev_count ++; + } + spin_unlock(&args->inode->i_lock); + /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ + if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) { + dprintk("%s: truncating devinfo to limit (%d:%d)\n", + __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); + dev_count = PNFS_LAYOUTSTATS_MAXDEV; + } + args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); + if (!args->devinfo) + return -ENOMEM; + + spin_lock(&args->inode->i_lock); + args->num_dev = ff_layout_mirror_prepare_stats(args, + &ff_layout->generic_hdr, dev_count); + spin_unlock(&args->inode->i_lock); + + return 0; +} + +static void +ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) +{ + struct nfs4_ff_layout_mirror *mirror; + int i; + + for (i = 0; i < data->args.num_dev; i++) { + mirror = data->args.devinfo[i].layout_private; + data->args.devinfo[i].layout_private = NULL; + ff_layout_put_mirror(mirror); + } +} + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -1496,6 +2093,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .free_layout_hdr = ff_layout_free_layout_hdr, .alloc_lseg = ff_layout_alloc_lseg, .free_lseg = ff_layout_free_lseg, + .add_lseg = ff_layout_add_lseg, .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, @@ -1510,6 +2108,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .encode_layoutreturn = ff_layout_encode_layoutreturn, .sync = pnfs_nfs_generic_sync, + .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cleanup_layoutstats = ff_layout_cleanup_layoutstats, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 070f20445b2d..68cc0d9828f9 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -9,12 +9,17 @@ #ifndef FS_NFS_NFS4FLEXFILELAYOUT_H #define FS_NFS_NFS4FLEXFILELAYOUT_H +#define FF_FLAGS_NO_LAYOUTCOMMIT 1 + #include "../pnfs.h" /* XXX: Let's filter out insanely large mirror count for now to avoid oom * due to network error etc. */ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +/* LAYOUTSTATS report interval in ms */ +#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) + struct nfs4_ff_ds_version { u32 version; u32 minor_version; @@ -41,24 +46,50 @@ struct nfs4_ff_layout_ds_err { struct nfs4_deviceid deviceid; }; +struct nfs4_ff_io_stat { + __u64 ops_requested; + __u64 bytes_requested; + __u64 ops_completed; + __u64 bytes_completed; + __u64 bytes_not_delivered; + ktime_t total_busy_time; + ktime_t aggregate_completion_time; +}; + +struct nfs4_ff_busy_timer { + ktime_t start_time; + atomic_t n_ops; +}; + +struct nfs4_ff_layoutstat { + struct nfs4_ff_io_stat io_stat; + struct nfs4_ff_busy_timer busy_timer; +}; + struct nfs4_ff_layout_mirror { + struct pnfs_layout_hdr *layout; + struct list_head mirrors; u32 ds_count; u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - struct nfs4_string user_name; - struct nfs4_string group_name; u32 uid; u32 gid; struct rpc_cred *cred; + atomic_t ref; spinlock_t lock; + struct nfs4_ff_layoutstat read_stat; + struct nfs4_ff_layoutstat write_stat; + ktime_t start_time; + ktime_t last_report_time; }; struct nfs4_ff_layout_segment { struct pnfs_layout_segment generic_hdr; u64 stripe_unit; + u32 flags; u32 mirror_array_cnt; struct nfs4_ff_layout_mirror **mirror_array; }; @@ -66,6 +97,7 @@ struct nfs4_ff_layout_segment { struct nfs4_flexfile_layout { struct pnfs_layout_hdr generic_hdr; struct pnfs_ds_commit_info commit_info; + struct list_head mirrors; struct list_head error_list; /* nfs4_ff_layout_ds_err */ }; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 77a2d026aa12..e125e55de86d 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -172,6 +172,32 @@ out_err: return NULL; } +static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, + struct nfs4_deviceid_node *devid) +{ + nfs4_mark_deviceid_unavailable(devid); + if (!ff_layout_has_available_ds(lseg)) + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); +} + +static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror == NULL || mirror->mirror_ds == NULL) { + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); + return false; + } + if (mirror->mirror_ds->ds == NULL) { + struct nfs4_deviceid_node *devid; + devid = &mirror->mirror_ds->id_node; + ff_layout_mark_devid_invalid(lseg, devid); + return false; + } + return true; +} + static u64 end_offset(u64 start, u64 len) { @@ -324,7 +350,8 @@ static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, __func__, PTR_ERR(cred)); return PTR_ERR(cred); } else { - mirror->cred = cred; + if (cmpxchg(&mirror->cred, NULL, cred)) + put_rpccred(cred); } } return 0; @@ -335,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); struct nfs_fh *fh = NULL; - struct nfs4_deviceid_node *devid; - if (mirror == NULL || mirror->mirror_ds == NULL || - mirror->mirror_ds->ds == NULL) { - printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n", + if (!ff_layout_mirror_valid(lseg, mirror)) { + pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", __func__, mirror_idx); - if (mirror && mirror->mirror_ds) { - devid = &mirror->mirror_ds->id_node; - pnfs_generic_mark_devid_invalid(devid); - } goto out; } @@ -367,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, unsigned int max_payload; rpc_authflavor_t flavor; - if (mirror == NULL || mirror->mirror_ds == NULL || - mirror->mirror_ds->ds == NULL) { - printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", + if (!ff_layout_mirror_valid(lseg, mirror)) { + pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", __func__, ds_idx); - if (mirror && mirror->mirror_ds) { - devid = &mirror->mirror_ds->id_node; - pnfs_generic_mark_devid_invalid(devid); - } goto out; } @@ -386,7 +402,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); if (ds->ds_clp) - goto out; + goto out_update_creds; flavor = nfs4_ff_layout_choose_authflavor(mirror); @@ -430,7 +446,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, } } } - +out_update_creds: if (ff_layout_update_mirror_cred(mirror, ds)) ds = NULL; out: @@ -499,16 +515,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, range->offset, range->length)) continue; /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) - * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4) + * + array length + deviceid(NFS4_DEVICEID4_SIZE) + * + status(4) + opnum(4) */ p = xdr_reserve_space(xdr, - 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); + 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); if (unlikely(!p)) return -ENOBUFS; p = xdr_encode_hyper(p, err->offset); p = xdr_encode_hyper(p, err->length); p = xdr_encode_opaque_fixed(p, &err->stateid, NFS4_STATEID_SIZE); + /* Encode 1 error */ + *p++ = cpu_to_be32(1); p = xdr_encode_opaque_fixed(p, &err->deviceid, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(err->status); @@ -524,11 +543,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, return 0; } -bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - int idx; + u32 idx; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); @@ -542,6 +561,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) return false; } +static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *devid; + u32 idx; + + for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { + mirror = FF_LAYOUT_COMP(lseg, idx); + if (!mirror || !mirror->mirror_ds) + return false; + devid = &mirror->mirror_ds->id_node; + if (ff_layout_test_devid_unavailable(devid)) + return false; + } + + return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; +} + +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_READ) + return ff_read_layout_has_available_ds(lseg); + /* Note: RW layout needs all mirrors available */ + return ff_rw_layout_has_available_ds(lseg); +} + module_param(dataserver_retrans, uint, 0644); MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " "retries a request before it attempts further " diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f734562c6d24..326d9e10d833 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -442,8 +442,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; - else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else @@ -503,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; - int error = -ENOMEM; + int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -512,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { - loff_t i_size; - BUG_ON(!S_ISREG(inode->i_mode)); - i_size = i_size_read(inode); - if (attr->ia_size == i_size) + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + + if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; - else if (attr->ia_size < i_size && IS_SWAPFILE(inode)) - return -ETXTBSY; } /* Optimization: if the end result is no change, don't RPC */ @@ -535,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); - if (fattr == NULL) + if (fattr == NULL) { + error = -ENOMEM; goto out; + } + /* * Return any delegations if we're going to change ACLs */ @@ -678,6 +681,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; } out: trace_nfs_getattr_exit(inode, err); @@ -756,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context); * @ctx: pointer to context * @is_sync: is this a synchronous close * - * always ensure that the attributes are up to date if we're mounted - * with close-to-open semantics + * Ensure that the attributes are up to date if we're mounted + * with close-to-open semantics and we have cached data that will + * need to be revalidated on open. */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) { + struct nfs_inode *nfsi; struct inode *inode; struct nfs_server *server; @@ -769,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) if (!is_sync) return; inode = d_inode(ctx->dentry); - if (!list_empty(&NFS_I(inode)->open_files)) + nfsi = NFS_I(inode); + if (inode->i_mapping->nrpages == 0) + return; + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return; + if (!list_empty(&nfsi->open_files)) return; server = NFS_SERVER(inode); if (server->flags & NFS_MOUNT_NOCTO) @@ -841,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx) } EXPORT_SYMBOL_GPL(put_nfs_open_context); +static void put_nfs_open_context_sync(struct nfs_open_context *ctx) +{ + __put_nfs_open_context(ctx, 1); +} + /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages @@ -885,7 +902,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c return ctx; } -static void nfs_file_clear_open_context(struct file *filp) +void nfs_file_clear_open_context(struct file *filp) { struct nfs_open_context *ctx = nfs_file_open_context(filp); @@ -896,7 +913,7 @@ static void nfs_file_clear_open_context(struct file *filp) spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); spin_unlock(&inode->i_lock); - __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); + put_nfs_open_context_sync(ctx); } } @@ -916,12 +933,6 @@ int nfs_open(struct inode *inode, struct file *filp) return 0; } -int nfs_release(struct inode *inode, struct file *filp) -{ - nfs_file_clear_open_context(filp); - return 0; -} - /* * This function is called whenever some part of NFS notices that * the cached attributes have to be refreshed. @@ -1242,9 +1253,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->nrequests == 0) + if (cur_size != new_isize) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; } + if (nfsi->nrequests != 0) + invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1268,13 +1281,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; } -static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) -{ - if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) - return 0; - return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; -} - static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) @@ -1423,7 +1429,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n const struct nfs_inode *nfsi = NFS_I(inode); return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || - nfs_ctime_need_update(inode, fattr) || ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } @@ -1486,6 +1491,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr { unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + /* + * Don't revalidate the pagecache if we hold a delegation, but do + * force an attribute update + */ + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED; + if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); @@ -1682,13 +1694,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else if (server->caps & NFS_CAP_CHANGE_ATTR) + } else nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { @@ -1715,7 +1726,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((nfsi->nrequests == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - invalid &= ~NFS_INO_REVAL_PAGECACHE; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -2008,17 +2018,15 @@ static int __init init_nfs_fs(void) if (err) goto out1; -#ifdef CONFIG_PROC_FS rpc_proc_register(&init_net, &nfs_rpcstat); -#endif - if ((err = register_nfs_fs()) != 0) + + err = register_nfs_fs(); + if (err) goto out0; return 0; out0: -#ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); -#endif nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); @@ -2049,9 +2057,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); -#ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); -#endif unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7e3c4604bea8..56cfde26fb9c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void) } #endif -#ifdef CONFIG_NFS_V4_1 -int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); -#endif - /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; @@ -296,6 +292,22 @@ extern struct rpc_procinfo nfs4_procedures[]; #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + if (!dst || !src) + return NULL; + + if (src->len > NFS4_MAXLABELLEN) + return NULL; + + dst->lfs = src->lfs; + dst->pi = src->pi; + dst->len = src->len; + memcpy(dst->label, src->label, src->len); + + return dst; +} static inline void nfs4_label_free(struct nfs4_label *label) { if (label) { @@ -316,6 +328,11 @@ static inline void nfs4_label_free(void *label) {} static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + return NULL; +} #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ @@ -343,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *) /* file.c */ int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); loff_t nfs_file_llseek(struct file *, loff_t, int); -int nfs_file_flush(struct file *, fl_owner_t); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); @@ -469,6 +485,9 @@ void nfs_retry_commit(struct list_head *page_list, void nfs_commitdata_release(struct nfs_commit_data *data); void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo); +void nfs_request_add_commit_list_locked(struct nfs_page *req, + struct list_head *dst, + struct nfs_commit_info *cinfo); void nfs_request_remove_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_init_cinfo(struct nfs_commit_info *cinfo, @@ -602,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) * Record the page as unstable and mark its inode as dirty. */ static inline -void nfs_mark_page_unstable(struct page *page) +void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) { - struct inode *inode = page_file_mapping(page)->host; + if (!cinfo->dreq) { + struct inode *inode = page_file_mapping(page)->host; - inc_zone_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + } } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 53852a4bd88b..267126d32ec0 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, { encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); encode_symlinkdata3(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* @@ -1342,7 +1343,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, if (args->npages != 0) xdr_write_pages(xdr, args->pages, 0, args->len); else - xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); + xdr_reserve_space(xdr, args->len); error = nfsacl_encode(xdr->buf, base, args->inode, (args->mask & NFS_ACL) ? diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 7afb8947dfdf..814c1255f1d2 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -5,12 +5,17 @@ #ifndef __LINUX_FS_NFS_NFS4_2_H #define __LINUX_FS_NFS_NFS4_2_H +/* + * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support + * more? Need to consider not to pre-alloc too much for a compound. + */ +#define PNFS_LAYOUTSTATS_MAXDEV (4) + /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); - -/* nfs4.2xdr.h */ -extern struct rpc_procinfo nfs4_2_procedures[]; +int nfs42_proc_layoutstats_generic(struct nfs_server *, + struct nfs42_layoutstat_data *); #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 3a9e75235f30..d731bbf974aa 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -10,6 +10,11 @@ #include <linux/nfs_fs.h> #include "nfs4_fs.h" #include "nfs42.h" +#include "iostat.h" +#include "pnfs.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, fmode_t fmode) @@ -130,7 +135,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } -loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct inode *inode = file_inode(filep); struct nfs42_seek_args args = { @@ -165,3 +170,102 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); } + +loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +{ + struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_llseek(filep, offset, whence); + if (err == -ENOTSUPP) + return -EOPNOTSUPP; + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + + return err; +} + + +static void +nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, + &data->res.seq_res, task); +} + +static void +nfs42_layoutstat_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS; + default: + dprintk("%s server returns %d\n", __func__, task->tk_status); + } +} + +static void +nfs42_layoutstat_release(void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + if (nfss->pnfs_curr_ld->cleanup_layoutstats) + nfss->pnfs_curr_ld->cleanup_layoutstats(data); + + pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout); + smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags); + smp_mb__after_atomic(); + nfs_iput_and_deactive(data->inode); + kfree(data->args.devinfo); + kfree(data); +} + +static const struct rpc_call_ops nfs42_layoutstat_ops = { + .rpc_call_prepare = nfs42_layoutstat_prepare, + .rpc_call_done = nfs42_layoutstat_done, + .rpc_release = nfs42_layoutstat_release, +}; + +int nfs42_proc_layoutstats_generic(struct nfs_server *server, + struct nfs42_layoutstat_data *data) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs42_layoutstat_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + + data->inode = nfs_igrab_and_active(data->args.inode); + if (!data->inode) { + nfs42_layoutstat_release(data); + return -EAGAIN; + } + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + return 0; +} diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 1a25b27248f2..0eb29e14070d 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -4,6 +4,8 @@ #ifndef __LINUX_FS_NFS_NFS4_2XDR_H #define __LINUX_FS_NFS_NFS4_2XDR_H +#include "nfs42.h" + #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) @@ -22,6 +24,16 @@ 1 /* whence */ + \ 2 /* offset */ + \ 2 /* length */) +#define encode_io_info_maxsz 4 +#define encode_layoutstats_maxsz (op_decode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + encode_stateid_maxsz + \ + encode_io_info_maxsz + \ + encode_io_info_maxsz + \ + 1 /* opaque devaddr4 length */ + \ + XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) +#define decode_layoutstats_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ @@ -45,6 +57,14 @@ #define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_seek_maxsz) +#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz) +#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) static void encode_fallocate(struct xdr_stream *xdr, @@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr, encode_uint32(xdr, args->sa_what); } +static void encode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args, + struct nfs42_layoutstat_devinfo *devinfo, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr); + p = reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, devinfo->offset); + p = xdr_encode_hyper(p, devinfo->length); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4); + p = xdr_encode_hyper(p, devinfo->read_count); + p = xdr_encode_hyper(p, devinfo->read_bytes); + p = xdr_encode_hyper(p, devinfo->write_count); + p = xdr_encode_hyper(p, devinfo->write_bytes); + p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data, + NFS4_DEVICEID4_SIZE); + /* Encode layoutupdate4 */ + *p++ = cpu_to_be32(devinfo->layout_type); + if (devinfo->layoutstats_encode != NULL) + devinfo->layoutstats_encode(xdr, args, devinfo); + else + encode_uint32(xdr, 0); +} + /* * Encode ALLOCATE request */ @@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode LAYOUTSTATS request + */ +static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args) +{ + int i; + + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV); + for (i = 0; i < args->num_dev; i++) + encode_layoutstats(xdr, args, &args->devinfo[i], &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -169,6 +238,11 @@ out_overflow: return -EIO; } +static int decode_layoutstats(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LAYOUTSTATS); +} + /* * Decode ALLOCATE request */ @@ -246,4 +320,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode LAYOUTSTATS request + */ +static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_layoutstat_res *res) +{ + struct compound_hdr hdr; + int status, i; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV); + for (i = 0; i < res->num_dev; i++) { + status = decode_layoutstats(xdr); + if (status) + goto out; + } +out: + res->rpc_status = status; + return status; +} + #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index fdef424b0cd3..50cfc4ca7a02 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, struct rpc_message *, struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); +extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); @@ -404,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); -extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); -extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp); - +extern void nfs41_notify_server(struct nfs_client *); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index e42be52a8c18..223bedda64ae 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -676,7 +676,6 @@ found: break; } - /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s status = %d\n", __func__, status); nfs_put_client(prev); @@ -730,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr, return false; /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(addr, clap)) - return false; - - return true; + return rpc_cmp_addr(addr, clap); } /* diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index f58c17b3b480..b0dbe0abed53 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -6,7 +6,9 @@ #include <linux/fs.h> #include <linux/falloc.h> #include <linux/nfs_fs.h> +#include "delegation.h" #include "internal.h" +#include "iostat.h" #include "fscache.h" #include "pnfs.h" @@ -27,7 +29,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct inode *dir; unsigned openflags = filp->f_flags; struct iattr attr; - int opened = 0; int err; /* @@ -41,6 +42,10 @@ nfs4_file_open(struct inode *inode, struct file *filp) dprintk("NFS: open file(%pd2)\n", dentry); + err = nfs_check_flags(openflags); + if (err) + return err; + if ((openflags & O_ACCMODE) == 3) openflags--; @@ -62,7 +67,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_sync_inode(inode); } - inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); switch (err) { @@ -96,6 +101,31 @@ out_drop: goto out_put_ctx; } +/* + * Flush all dirty pages, and check for write errors. + */ +static int +nfs4_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + dprintk("NFS: flush(%pD2)\n", file); + + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + /* + * If we're holding a write delegation, then check if we're required + * to flush the i/o on close. If not, then just start the i/o now. + */ + if (!nfs4_delegation_flush_on_close(inode)) + return filemap_fdatawrite(file->f_mapping); + + /* Flush writes to the server and return any errors */ + return vfs_fsync(file, 0); +} + static int nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -174,7 +204,7 @@ const struct file_operations nfs4_file_operations = { .write_iter = nfs_file_write, .mmap = nfs_file_mmap, .open = nfs4_file_open, - .flush = nfs_file_flush, + .flush = nfs4_file_flush, .release = nfs_file_release, .fsync = nfs4_file_fsync, .lock = nfs_lock, diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index c0b3a16b4a00..039b3eb6d834 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p goto out; } - if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { - printk(KERN_ERR "nfs4_get_rootfh:" - " getroot obtained referral\n"); - ret = -EREMOTE; - goto out; - } - memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); out: nfs_free_fattr(fsinfo.fattr); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 2e1737c40a29..2e4902203c35 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = { .read = user_read, }; -static int nfs_idmap_init_keyring(void) +int nfs_idmap_init(void) { struct cred *cred; struct key *keyring; @@ -230,7 +230,7 @@ failed_put_cred: return ret; } -static void nfs_idmap_quit_keyring(void) +void nfs_idmap_quit(void) { key_revoke(id_resolver_cache->thread_keyring); unregister_key_type(&key_type_id_resolver); @@ -492,21 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp) kfree(idmap); } -int nfs_idmap_init(void) -{ - int ret; - ret = nfs_idmap_init_keyring(); - if (ret != 0) - goto out; -out: - return ret; -} - -void nfs_idmap_quit(void) -{ - nfs_idmap_quit_keyring(); -} - static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, struct idmap_msg *im, struct rpc_pipe_msg *msg) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 55e1e3af23a3..693b903b48bd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -356,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ case 0: return 0; case -NFS4ERR_OPENMODE: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: if (inode && nfs4_have_delegation(inode, FMODE_READ)) { nfs4_inode_return_delegation(inode); exception->retry = 1; @@ -367,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ if (ret < 0) break; goto wait_on_recovery; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - if (state == NULL) - break; - ret = nfs4_schedule_stateid_recovery(server, state); - if (ret < 0) - break; - goto wait_on_recovery; case -NFS4ERR_EXPIRED: if (state != NULL) { ret = nfs4_schedule_stateid_recovery(server, state); @@ -473,7 +467,10 @@ static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { - do_renew_lease(server->nfs_client, timestamp); + struct nfs_client *clp = server->nfs_client; + + if (!nfs4_has_session(clp)) + do_renew_lease(clp, timestamp); } struct nfs4_call_sync_data { @@ -482,8 +479,8 @@ struct nfs4_call_sync_data { struct nfs4_sequence_res *seq_res; }; -static void nfs4_init_sequence(struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, int cache_reply) +void nfs4_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) { args->sa_slot = NULL; args->sa_cache_this = cache_reply; @@ -589,7 +586,7 @@ out_unlock: spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; if (send_new_highest_used_slotid) - nfs41_server_notify_highest_slotid_update(session->clp); + nfs41_notify_server(session->clp); } int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -622,8 +619,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - if (res->sr_status_flags != 0) - nfs4_schedule_lease_recovery(clp); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -916,6 +912,7 @@ struct nfs4_opendata { struct nfs_open_confirmres c_res; struct nfs4_string owner_name; struct nfs4_string group_name; + struct nfs4_label *a_label; struct nfs_fattr f_attr; struct nfs4_label *f_label; struct dentry *dir; @@ -1019,6 +1016,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (IS_ERR(p->f_label)) goto err_free_p; + p->a_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->a_label)) + goto err_free_f; + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask); if (IS_ERR(p->o_arg.seqid)) @@ -1047,7 +1048,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.label = label; + p->o_arg.label = nfs4_label_copy(p->a_label, label); p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: @@ -1080,6 +1081,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, return p; err_free_label: + nfs4_label_free(p->a_label); +err_free_f: nfs4_label_free(p->f_label); err_free_p: kfree(p); @@ -1099,6 +1102,7 @@ static void nfs4_opendata_free(struct kref *kref) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); + nfs4_label_free(p->a_label); nfs4_label_free(p->f_label); dput(p->dir); @@ -1146,7 +1150,8 @@ out: return ret; } -static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) +static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, + enum open_claim_type4 claim) { if (delegation == NULL) return 0; @@ -1154,6 +1159,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) return 0; if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) return 0; + switch (claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) + break; + default: + return 0; + } nfs_mark_delegation_referenced(delegation); return 1; } @@ -1204,15 +1219,19 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state, static void nfs_resync_open_stateid_locked(struct nfs4_state *state) { + if (!(state->n_wronly || state->n_rdonly || state->n_rdwr)) + return; if (state->n_wronly) set_bit(NFS_O_WRONLY_STATE, &state->flags); if (state->n_rdonly) set_bit(NFS_O_RDONLY_STATE, &state->flags); if (state->n_rdwr) set_bit(NFS_O_RDWR_STATE, &state->flags); + set_bit(NFS_OPEN_STATE, &state->flags); } static void nfs_clear_open_stateid_locked(struct nfs4_state *state, + nfs4_stateid *arg_stateid, nfs4_stateid *stateid, fmode_t fmode) { clear_bit(NFS_O_RDWR_STATE, &state->flags); @@ -1231,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, if (stateid == NULL) return; /* Handle races with OPEN */ - if (!nfs4_stateid_match_other(stateid, &state->open_stateid) || - !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) || + (nfs4_stateid_match_other(stateid, &state->open_stateid) && + !nfs4_stateid_is_newer(stateid, &state->open_stateid))) { nfs_resync_open_stateid_locked(state); return; } @@ -1241,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, nfs4_stateid_copy(&state->open_stateid, stateid); } -static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_clear_open_stateid(struct nfs4_state *state, + nfs4_stateid *arg_stateid, + nfs4_stateid *stateid, fmode_t fmode) { write_seqlock(&state->seqlock); - nfs_clear_open_stateid_locked(state, stateid, fmode); + nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode); write_sequnlock(&state->seqlock); if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) nfs4_schedule_state_manager(state->owner->so_server->nfs_client); @@ -1369,6 +1391,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs_delegation *delegation; int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; + enum open_claim_type4 claim = opendata->o_arg.claim; nfs4_stateid stateid; int ret = -EAGAIN; @@ -1382,7 +1405,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) spin_unlock(&state->owner->so_lock); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (!can_open_delegated(delegation, fmode)) { + if (!can_open_delegated(delegation, fmode, claim)) { rcu_read_unlock(); break; } @@ -1553,6 +1576,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod struct nfs4_state *newstate; int ret; + if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR || + opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) && + (opendata->o_arg.u.delegation_type & fmode) != fmode) + /* This mode can't have been delegated, so we must have + * a valid open_stateid to cover it - not need to reclaim. + */ + return 0; opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; opendata->o_arg.share_access = nfs4_map_atomic_open_share( @@ -1684,6 +1714,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct "%d.\n", __func__, err); case 0: case -ENOENT: + case -EAGAIN: case -ESTALE: break; case -NFS4ERR_BADSESSION: @@ -1837,6 +1868,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) struct nfs4_opendata *data = calldata; struct nfs4_state_owner *sp = data->owner; struct nfs_client *clp = sp->so_server->nfs_client; + enum open_claim_type4 claim = data->o_arg.claim; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) goto out_wait; @@ -1851,15 +1883,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); - if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && - data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && - can_open_delegated(delegation, data->o_arg.fmode)) + if (can_open_delegated(delegation, data->o_arg.fmode, claim)) goto unlock_no_action; rcu_read_unlock(); } /* Update client id. */ data->o_arg.clientid = clp->cl_clientid; - switch (data->o_arg.claim) { + switch (claim) { + default: + break; case NFS4_OPEN_CLAIM_PREVIOUS: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: @@ -2279,15 +2311,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st * fields corresponding to attributes that were used to store the verifier. * Make sure we clobber those fields in the later setattr call */ -static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr) +static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, + struct iattr *sattr, struct nfs4_label **label) { - if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) && + const u32 *attrset = opendata->o_res.attrset; + + if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) && !(sattr->ia_valid & ATTR_ATIME_SET)) sattr->ia_valid |= ATTR_ATIME; - if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) && + if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) && !(sattr->ia_valid & ATTR_MTIME_SET)) sattr->ia_valid |= ATTR_MTIME; + + /* Except MODE, it seems harmless of setting twice. */ + if ((attrset[1] & FATTR4_WORD1_MODE)) + sattr->ia_valid &= ~ATTR_MODE; + + if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) + *label = NULL; } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, @@ -2410,9 +2452,9 @@ static int _nfs4_do_open(struct inode *dir, goto err_free_label; state = ctx->state; - if ((opendata->o_arg.open_flags & O_EXCL) && + if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { - nfs4_exclusive_attrset(opendata, sattr); + nfs4_exclusive_attrset(opendata, sattr, &label); nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, @@ -2424,7 +2466,7 @@ static int _nfs4_do_open(struct inode *dir, nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } - if (opendata->file_created) + if (opened && opendata->file_created) *opened |= FILE_CREATED; if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { @@ -2646,7 +2688,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: res_stateid = &calldata->res.stateid; - if (calldata->arg.fmode == 0 && calldata->roc) + if (calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); renew_lease(server, calldata->timestamp); @@ -2669,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data) goto out_release; } } - nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); + nfs_clear_open_stateid(state, &calldata->arg.stateid, + res_stateid, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); @@ -2720,14 +2763,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_no_action; } - if (calldata->arg.fmode == 0) { + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; - if (calldata->roc && - pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { - nfs_release_seqid(calldata->arg.seqid); - goto out_wait; - } - } + if (calldata->roc) + pnfs_roc_get_barrier(inode, &calldata->roc_barrier); + calldata->arg.share_access = nfs4_map_atomic_open_share(NFS_SERVER(inode), calldata->arg.fmode, 0); @@ -2868,8 +2908,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { + u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion; struct nfs4_server_caps_arg args = { .fhandle = fhandle, + .bitmask = bitmask, }; struct nfs4_server_caps_res res = {}; struct rpc_message msg = { @@ -2879,10 +2921,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f }; int status; + bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS | + FATTR4_WORD0_FH_EXPIRE_TYPE | + FATTR4_WORD0_LINK_SUPPORT | + FATTR4_WORD0_SYMLINK_SUPPORT | + FATTR4_WORD0_ACLSUPPORT; + if (minorversion) + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { /* Sanity check the server answers */ - switch (server->nfs_client->cl_minorversion) { + switch (minorversion) { case 0: res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; res.attr_bitmask[2] = 0; @@ -2935,6 +2985,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->cache_consistency_bitmask[2] = 0; + memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask, + sizeof(server->exclcreat_bitmask)); server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -3355,6 +3407,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, goto out; case -NFS4ERR_MOVED: err = nfs4_get_referral(client, dir, name, fattr, fhandle); + if (err == -NFS4ERR_MOVED) + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); goto out; case -NFS4ERR_WRONGSEC: err = -EPERM; @@ -3535,7 +3589,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; - int opened = 0; int status = 0; ctx = alloc_nfs_open_context(dentry, FMODE_READ); @@ -3545,7 +3598,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL); if (IS_ERR(state)) { status = PTR_ERR(state); goto out; @@ -4955,49 +5008,111 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } -static unsigned int -nfs4_init_nonuniform_client_string(struct nfs_client *clp, - char *buf, size_t len) +static int +nfs4_init_nonuniform_client_string(struct nfs_client *clp) { - unsigned int result; + int result; + size_t len; + char *str; if (clp->cl_owner_id != NULL) - return strlcpy(buf, clp->cl_owner_id, len); + return 0; rcu_read_lock(); - result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_PROTO)); + len = 14 + strlen(clp->cl_ipaddr) + 1 + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + + 1 + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + + 1; rcu_read_unlock(); - clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); - return result; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + rcu_read_lock(); + result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); + rcu_read_unlock(); + + clp->cl_owner_id = str; + return 0; } -static unsigned int -nfs4_init_uniform_client_string(struct nfs_client *clp, - char *buf, size_t len) +static int +nfs4_init_uniquifier_client_string(struct nfs_client *clp) +{ + int result; + size_t len; + char *str; + + len = 10 + 10 + 1 + 10 + 1 + + strlen(nfs4_client_id_uniquifier) + 1 + + strlen(clp->cl_rpcclient->cl_nodename) + 1; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, clp->cl_minorversion, + nfs4_client_id_uniquifier, + clp->cl_rpcclient->cl_nodename); + clp->cl_owner_id = str; + return 0; +} + +static int +nfs4_init_uniform_client_string(struct nfs_client *clp) { - const char *nodename = clp->cl_rpcclient->cl_nodename; - unsigned int result; + int result; + size_t len; + char *str; if (clp->cl_owner_id != NULL) - return strlcpy(buf, clp->cl_owner_id, len); + return 0; if (nfs4_client_id_uniquifier[0] != '\0') - result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", - clp->rpc_ops->version, - clp->cl_minorversion, - nfs4_client_id_uniquifier, - nodename); - else - result = scnprintf(buf, len, "Linux NFSv%u.%u %s", - clp->rpc_ops->version, clp->cl_minorversion, - nodename); - clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); - return result; + return nfs4_init_uniquifier_client_string(clp); + + len = 10 + 10 + 1 + 10 + 1 + + strlen(clp->cl_rpcclient->cl_nodename) + 1; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + result = scnprintf(str, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + clp->cl_rpcclient->cl_nodename); + clp->cl_owner_id = str; + return 0; } /* @@ -5044,7 +5159,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, struct nfs4_setclientid setclientid = { .sc_verifier = &sc_verifier, .sc_prog = program, - .sc_cb_ident = clp->cl_cb_ident, + .sc_clnt = clp, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], @@ -5064,16 +5179,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, /* nfs_client_id4 */ nfs4_init_boot_verifier(clp, &sc_verifier); + if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) - setclientid.sc_name_len = - nfs4_init_uniform_client_string(clp, - setclientid.sc_name, - sizeof(setclientid.sc_name)); + status = nfs4_init_uniform_client_string(clp); else - setclientid.sc_name_len = - nfs4_init_nonuniform_client_string(clp, - setclientid.sc_name, - sizeof(setclientid.sc_name)); + status = nfs4_init_nonuniform_client_string(clp); + + if (status) + goto out; + /* cb_client4 */ setclientid.sc_netid_len = nfs4_init_callback_netid(clp, @@ -5083,9 +5197,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); - dprintk("NFS call setclientid auth=%s, '%.*s'\n", + dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, - setclientid.sc_name_len, setclientid.sc_name); + clp->cl_owner_id); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) { status = PTR_ERR(task); @@ -5194,9 +5308,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (d_data->roc && - pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task)) - return; + if (d_data->roc) + pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier); nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, @@ -5357,15 +5470,15 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } -static int do_vfs_lock(struct file *file, struct file_lock *fl) +static int do_vfs_lock(struct inode *inode, struct file_lock *fl) { int res = 0; switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { case FL_POSIX: - res = posix_lock_file_wait(file, fl); + res = posix_lock_inode_wait(inode, fl); break; case FL_FLOCK: - res = flock_lock_file_wait(file, fl); + res = flock_lock_inode_wait(inode, fl); break; default: BUG(); @@ -5425,7 +5538,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - do_vfs_lock(calldata->fl.fl_file, &calldata->fl); + do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; @@ -5533,7 +5646,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); - if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + if (do_vfs_lock(inode, request) == -ENOENT) { up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -5674,7 +5787,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) { + if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) { rpc_restart_call_prepare(task); break; } @@ -5916,7 +6029,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (status != 0) goto out; request->fl_flags |= FL_ACCESS; - status = do_vfs_lock(request->fl_file, request); + status = do_vfs_lock(state->inode, request); if (status < 0) goto out; down_read(&nfsi->rwsem); @@ -5924,7 +6037,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(request->fl_file, request); + status = do_vfs_lock(state->inode, request); up_read(&nfsi->rwsem); goto out; } @@ -6846,11 +6959,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, }; nfs4_init_boot_verifier(clp, &verifier); - args.id_len = nfs4_init_uniform_client_string(clp, args.id, - sizeof(args.id)); - dprintk("NFS call exchange_id auth=%s, '%.*s'\n", + + status = nfs4_init_uniform_client_string(clp); + if (status) + goto out; + + dprintk("NFS call exchange_id auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, - args.id_len, args.id); + clp->cl_owner_id); res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); @@ -6885,7 +7001,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, /* unsupported! */ WARN_ON_ONCE(1); status = -EINVAL; - goto out_server_scope; + goto out_impl_id; } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); @@ -6913,6 +7029,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, /* use the most recent implementation id */ kfree(clp->cl_implid); clp->cl_implid = res.impl_id; + res.impl_id = NULL; if (clp->cl_serverscope != NULL && !nfs41_same_server_scope(clp->cl_serverscope, @@ -6926,15 +7043,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (clp->cl_serverscope == NULL) { clp->cl_serverscope = res.server_scope; - goto out; + res.server_scope = NULL; } - } else - kfree(res.impl_id); + } -out_server_owner: - kfree(res.server_owner); +out_impl_id: + kfree(res.impl_id); out_server_scope: kfree(res.server_scope); +out_server_owner: + kfree(res.server_owner); out: if (clp->cl_implid != NULL) dprintk("NFS reply exchange_id: Server Implementation ID: " @@ -7484,13 +7602,8 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) { - struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; - - if (task->tk_status == 0) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (!ret) ret = task->tk_status; - } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -7651,10 +7764,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) case 0: goto out; /* + * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of + * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). + */ + case -NFS4ERR_BADLAYOUT: + goto out_overflow; + /* * NFS4ERR_LAYOUTTRYLATER is a conflict with another client - * (or clients) writing to the same RAID stripe + * (or clients) writing to the same RAID stripe except when + * the minlength argument is 0 (see RFC5661 section 18.43.3). */ case -NFS4ERR_LAYOUTTRYLATER: + if (lgp->args.minlength == 0) + goto out_overflow; /* * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall * existing layout before getting a new one). @@ -7710,6 +7832,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) rpc_restart_call_prepare(task); out: dprintk("<-- %s\n", __func__); + return; +out_overflow: + task->tk_status = -EOVERFLOW; + goto out; } static size_t max_response_pages(struct nfs_server *server) @@ -7878,16 +8004,17 @@ static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; + LIST_HEAD(freeme); dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); - clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); - rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); + pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); kfree(calldata); @@ -8061,9 +8188,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) struct rpc_task *task; int status = 0; - dprintk("NFS: %4d initiating layoutcommit call. sync %d " - "lbw: %llu inode %lu\n", - data->task.tk_pid, sync, + dprintk("NFS: initiating layoutcommit call. sync %d " + "lbw: %llu inode %lu\n", sync, data->args.lastbytewritten, data->args.inode->i_ino); @@ -8502,7 +8628,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK, .init_client = nfs40_init_client, .shutdown_client = nfs40_shutdown_client, @@ -8528,7 +8653,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1, @@ -8551,13 +8675,13 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .minor_version = 2, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE | NFS_CAP_DEALLOCATE - | NFS_CAP_SEEK, + | NFS_CAP_SEEK + | NFS_CAP_LAYOUTSTATS, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -8568,6 +8692,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, + .mig_recovery_ops = &nfs41_mig_recovery_ops, }; #endif diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2782cfca2265..da73bc443238 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) goto do_confirm; - nfs4_begin_drain_session(clp); status = nfs4_proc_exchange_id(clp, cred); if (status != 0) goto out; @@ -1482,6 +1481,8 @@ restart: spin_unlock(&state->state_lock); } nfs4_put_open_state(state); + clear_bit(NFS4CLNT_RECLAIM_NOGRACE, + &state->flags); spin_lock(&sp->so_lock); goto restart; } @@ -1830,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp) clp->cl_mvops->reboot_recovery_ops; int status; + nfs4_begin_drain_session(clp); cred = nfs4_get_clid_cred(clp); if (cred == NULL) return -ENOENT; @@ -2150,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); -static void nfs41_ping_server(struct nfs_client *clp) +void nfs41_notify_server(struct nfs_client *clp) { /* Use CHECK_LEASE to ping the server with a SEQUENCE */ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); nfs4_schedule_state_manager(clp); } -void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) -{ - nfs41_ping_server(clp); -} - -void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) -{ - nfs41_ping_server(clp); -} - static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { @@ -2189,25 +2181,35 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) } } -static void nfs41_handle_state_revoked(struct nfs_client *clp) +static void nfs41_handle_all_state_revoked(struct nfs_client *clp) { nfs4_reset_all_state(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); } +static void nfs41_handle_some_state_revoked(struct nfs_client *clp) +{ + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_schedule_state_manager(clp); + + dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); +} + static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) { - /* This will need to handle layouts too */ - nfs_expire_all_delegations(clp); + /* FIXME: For now, we destroy all layouts. */ + pnfs_destroy_all_layouts(clp); + /* FIXME: For now, we test all delegations+open state+locks. */ + nfs41_handle_some_state_revoked(clp); dprintk("%s: Recallable state revoked on server %s!\n", __func__, clp->cl_hostname); } static void nfs41_handle_backchannel_fault(struct nfs_client *clp) { - nfs_expire_all_delegations(clp); - if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + nfs4_schedule_state_manager(clp); + dprintk("%s: server %s declared a backchannel fault\n", __func__, clp->cl_hostname); } @@ -2229,10 +2231,11 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | - SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED)) + nfs41_handle_all_state_revoked(clp); + if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED)) - nfs41_handle_state_revoked(clp); + nfs41_handle_some_state_revoked(clp); if (flags & SEQ4_STATUS_LEASE_MOVED) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 470af1a78bec..28df12e525ba 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -884,6 +884,66 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr); DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root); DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo); +DECLARE_EVENT_CLASS(nfs4_inode_callback_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs_fh *fhandle, + const struct inode *inode, + int error + ), + + TP_ARGS(clp, fhandle, inode, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fhandle); + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + } else { + __entry->fileid = 0; + __entry->dev = 0; + } + __assign_str(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __get_str(dstaddr) + ) +); + +#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_callback_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + const struct nfs_fh *fhandle, \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(clp, fhandle, inode, error)) +DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); +DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode); + + DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( const char *name, @@ -1136,6 +1196,7 @@ TRACE_EVENT(nfs4_layoutget, DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); +DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0aea97841d30..788adf3897c7 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int); #define encode_setclientid_maxsz \ (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ - XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \ + /* client name */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* sc_prog */ + \ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ @@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int); #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ encode_verifier_maxsz + \ 1 /* co_ownerid.len */ + \ - XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + /* eia_clientowner */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* flags */ + \ 1 /* spa_how */ + \ /* max is SP4_MACH_CRED (for now) */ + \ @@ -398,7 +400,8 @@ static int nfs4_stat_to_errno(int); #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ - 1 /* FIXME: opaque lrf_body always empty at the moment */) + 1 + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 1 + decode_stateid_maxsz) #define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) @@ -999,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, - const struct nfs_server *server) + const struct nfs_server *server, + bool excl_check) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1065,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; } + + if (excl_check) { + const u32 *excl_bmval = server->exclcreat_bitmask; + bmval[0] &= excl_bmval[0]; + bmval[1] &= excl_bmval[1]; + bmval[2] &= excl_bmval[2]; + + if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) + label = NULL; + } + if (label) { len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; @@ -1152,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * case NF4LNK: p = reserve_space(xdr, 4); *p = cpu_to_be32(create->u.symlink.len); - xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); + xdr_write_pages(xdr, create->u.symlink.pages, 0, + create->u.symlink.len); + xdr->buf->flags |= XDRBUF_WRITE; break; case NF4BLK: case NF4CHR: @@ -1166,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server); + encode_attrs(xdr, create->attrs, create->label, create->server, false); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1380,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { - struct iattr dummy; __be32 *p; p = reserve_space(xdr, 4); switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1400,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - dummy.ia_valid = 0; - encode_attrs(xdr, &dummy, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true); } } @@ -1657,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server); + encode_attrs(xdr, arg->iap, arg->label, server, false); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -1667,13 +1682,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); encode_nfs4_verifier(xdr, setclientid->sc_verifier); - encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); + encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id), + setclientid->sc_clnt->cl_owner_id); p = reserve_space(xdr, 4); *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); p = reserve_space(xdr, 4); - *p = cpu_to_be32(setclientid->sc_cb_ident); + *p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident); } static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) @@ -1747,7 +1763,8 @@ static void encode_exchange_id(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); encode_nfs4_verifier(xdr, args->verifier); - encode_string(xdr, args->id_len, args->id); + encode_string(xdr, strlen(args->client->cl_owner_id), + args->client->cl_owner_id); encode_uint32(xdr, args->flags); encode_uint32(xdr, args->state_protect.how); @@ -2576,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_server_caps_arg *args) { + const u32 *bitmask = args->bitmask; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; @@ -2583,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| - FATTR4_WORD0_FH_EXPIRE_TYPE| - FATTR4_WORD0_LINK_SUPPORT| - FATTR4_WORD0_SYMLINK_SUPPORT| - FATTR4_WORD0_ACLSUPPORT, &hdr); + encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr); encode_nops(&hdr); } @@ -3364,6 +3378,22 @@ out_overflow: return -EIO; } +static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, + uint32_t *bitmap, uint32_t *bitmask) +{ + if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) { + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; + bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT; + } else + bitmask[0] = bitmask[1] = bitmask[2] = 0; + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, + bitmask[0], bitmask[1], bitmask[2]); + return 0; +} + static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) { __be32 *p; @@ -4317,6 +4347,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_exclcreat_supported(xdr, bitmap, + res->exclcreat_bitmask)) != 0) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d!\n", __func__, -status); @@ -4899,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr) } /* This is too sick! */ -static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) +static int decode_space_limit(struct xdr_stream *xdr, + unsigned long *pagemod_limit) { __be32 *p; uint32_t limit_type, nblocks, blocksize; + u64 maxsize = 0; p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; limit_type = be32_to_cpup(p++); switch (limit_type) { - case 1: - xdr_decode_hyper(p, maxsize); + case NFS4_LIMIT_SIZE: + xdr_decode_hyper(p, &maxsize); break; - case 2: + case NFS4_LIMIT_BLOCKS: nblocks = be32_to_cpup(p++); blocksize = be32_to_cpup(p); - *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; + maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } + maxsize >>= PAGE_CACHE_SHIFT; + *pagemod_limit = min_t(u64, maxsize, ULONG_MAX); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4944,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, break; case NFS4_OPEN_DELEGATE_WRITE: res->delegation_type = FMODE_WRITE|FMODE_READ; - if (decode_space_limit(xdr, &res->maxsize) < 0) + if (decode_space_limit(xdr, &res->pagemod_limit) < 0) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); @@ -7427,6 +7464,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SEEK, enc_seek, dec_seek), PROC(ALLOCATE, enc_allocate, dec_allocate), PROC(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 282b39369510..7c5718ba625e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { spin_lock(&hdr->lock); - if (pos < hdr->io_start + hdr->good_bytes) { - set_bit(NFS_IOHDR_ERROR, &hdr->flags); + if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags) + || pos < hdr->io_start + hdr->good_bytes) { clear_bit(NFS_IOHDR_EOF, &hdr->flags); hdr->good_bytes = pos - hdr->io_start; hdr->error = error; @@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); - dprintk("NFS: %5u initiated pgio call " + dprintk("NFS: initiated pgio call " "(req %s/%llu, %u bytes @ offset %llu)\n", - hdr->task.tk_pid, hdr->inode->i_sb->s_id, (unsigned long long)NFS_FILEID(hdr->inode), hdr->args.count, @@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - if (hdr->rw_ops->rw_release) - hdr->rw_ops->rw_release(hdr); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror, * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor * @inode: pointer to inode - * @doio: pointer to io function + * @pg_ops: pointer to pageio operations + * @compl_ops: pointer to pageio completion operations + * @rw_ops: pointer to nfs read/write operations * @bsize: io block size * @io_flags: extra parameters for the io function */ @@ -1101,8 +1100,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) mirror->pg_base = 0; mirror->pg_recoalesce = 0; - desc->pg_moreio = 0; - while (!list_empty(&head)) { struct nfs_page *req; @@ -1110,8 +1107,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; - if (desc->pg_error < 0) + if (desc->pg_error < 0) { + list_splice_tail(&head, &mirror->pg_list); + mirror->pg_recoalesce = 1; return 0; + } break; } } while (mirror->pg_recoalesce); @@ -1186,6 +1186,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an * nfs_pageio_descriptor * @desc: pointer to io descriptor + * @mirror_idx: pointer to mirror index */ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, u32 mirror_idx) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 230606243be6..ba1246433794 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -35,6 +35,7 @@ #include "iostat.h" #include "nfs4trace.h" #include "delegation.h" +#include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -351,7 +352,7 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, { struct pnfs_layout_segment *s; - if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return false; list_for_each_entry(s, &lo->plh_segs, pls_list) @@ -361,6 +362,17 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, return true; } +static bool +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +{ + if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return false; + lo->plh_return_iomode = 0; + pnfs_get_layout_hdr(lo); + clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); + return true; +} + static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, struct pnfs_layout_hdr *lo, struct inode *inode) { @@ -371,17 +383,16 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, if (pnfs_layout_need_return(lo, lseg)) { nfs4_stateid stateid; enum pnfs_iomode iomode; + bool send; stateid = lo->plh_stateid; iomode = lo->plh_return_iomode; - /* decreased in pnfs_send_layoutreturn() */ - lo->plh_block_lgets++; - lo->plh_return_iomode = 0; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); - pnfs_get_layout_hdr(lo); - - /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + if (send) { + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, stateid, iomode, false); + } } else spin_unlock(&inode->i_lock); } @@ -410,6 +421,10 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); @@ -450,6 +465,8 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); if (atomic_dec_and_test(&lseg->pls_refcount)) { struct pnfs_layout_hdr *lo = lseg->pls_layout; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + return; pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); pnfs_free_lseg_async(lseg); @@ -799,25 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); } -static bool -pnfs_layout_returning(const struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range) -{ - return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && - (lo->plh_return_iomode == IOMODE_ANY || - lo->plh_return_iomode == range->iomode); -} - /* lget is set to 1 if called from inside send_layoutget call chain */ static bool -pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, int lget) +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) { return lo->plh_block_lgets || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - (list_empty(&lo->plh_segs) && - (atomic_read(&lo->plh_outstanding) > lget)) || - pnfs_layout_returning(lo, range); + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } int @@ -829,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo, range, 1)) { + if (pnfs_layoutgets_blocked(lo)) { status = -EAGAIN; } else if (!nfs4_valid_open_stateid(open_state)) { status = -EBADF; @@ -864,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg; + loff_t i_size; dprintk("--> %s\n", __func__); @@ -871,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo, if (lgp == NULL) return NULL; + i_size = i_size_read(ino); + lgp->args.minlength = PAGE_CACHE_SIZE; if (lgp->args.minlength > range->length) lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range = *range; lgp->args.type = server->pnfs_curr_ld->id; @@ -923,6 +936,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } static int @@ -937,9 +951,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, if (unlikely(lrp == NULL)) { status = -ENOMEM; spin_lock(&ino->i_lock); - lo->plh_block_lgets--; pnfs_clear_layoutreturn_waitbit(lo); - rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); spin_unlock(&ino->i_lock); pnfs_put_layout_hdr(lo); goto out; @@ -977,6 +989,7 @@ _pnfs_return_layout(struct inode *ino) LIST_HEAD(tmp_list); nfs4_stateid stateid; int status = 0, empty; + bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1006,17 +1019,18 @@ _pnfs_return_layout(struct inode *ino) /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); - pnfs_put_layout_hdr(lo); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out; + goto out_put_layout_hdr; } set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - lo->plh_block_lgets++; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + if (send) + status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); +out_put_layout_hdr: + pnfs_put_layout_hdr(lo); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -1059,15 +1073,14 @@ bool pnfs_roc(struct inode *ino) struct pnfs_layout_segment *lseg, *tmp; nfs4_stateid stateid; LIST_HEAD(tmp_list); - bool found = false, layoutreturn = false; + bool found = false, layoutreturn = false, roc = false; spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) goto out_noroc; - /* Don't return layout if we hold a delegation */ + /* no roc if we hold a delegation */ if (nfs4_check_delegation(ino, FMODE_READ)) goto out_noroc; @@ -1078,38 +1091,41 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } + stateid = lo->plh_stateid; + /* always send layoutreturn if being marked so */ + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); + pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + /* If we are sending layoutreturn, invalidate all valid lsegs */ + if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { mark_lseg_invalid(lseg, &tmp_list); found = true; } - if (!found) - goto out_noroc; - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ - spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); - pnfs_layoutcommit_inode(ino, true); - return true; + /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put + * in pnfs_roc_release(). We don't really send a layoutreturn but + * still want others to view us like we are sending one! + * + * If pnfs_prepare_layoutreturn() fails, it means someone else is doing + * LAYOUTRETURN, so we proceed like there are no layouts to return. + * + * ROC in three conditions: + * 1. there are ROC lsegs + * 2. we don't send layoutreturn + * 3. no others are sending layoutreturn + */ + if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo)) + roc = true; out_noroc: - if (lo) { - stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } - } spin_unlock(&ino->i_lock); - if (layoutreturn) { - pnfs_layoutcommit_inode(ino, true); + pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); + if (layoutreturn) pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); - } - return false; + return roc; } void pnfs_roc_release(struct inode *ino) @@ -1118,7 +1134,7 @@ void pnfs_roc_release(struct inode *ino) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - lo->plh_block_lgets--; + pnfs_clear_layoutreturn_waitbit(lo); if (atomic_dec_and_test(&lo->plh_refcount)) { pnfs_detach_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -1136,24 +1152,16 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); + trace_nfs4_layoutreturn_on_close(ino, 0); } -bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *lo; - struct pnfs_layout_segment *lseg; - nfs4_stateid stateid; u32 current_seqid; - bool found = false, layoutreturn = false; spin_lock(&ino->i_lock); - list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - found = true; - goto out; - } lo = nfsi->layout; current_seqid = be32_to_cpu(lo->plh_stateid.seqid); @@ -1161,23 +1169,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) * a barrier, we choose the worst-case barrier. */ *barrier = current_seqid + atomic_read(&lo->plh_outstanding); -out: - if (!found) { - stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } - } spin_unlock(&ino->i_lock); - if (layoutreturn) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); - } - return found; } /* @@ -1205,16 +1197,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); } -static void -pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, - struct pnfs_layout_segment *lseg) +static bool +pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { - struct pnfs_layout_segment *lp; + return pnfs_lseg_range_cmp(l1, l2) > 0; +} + +static bool +pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg, + struct pnfs_layout_segment *old) +{ + return false; +} + +void +pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + bool (*is_after)(const struct pnfs_layout_range *, + const struct pnfs_layout_range *), + bool (*do_merge)(struct pnfs_layout_segment *, + struct pnfs_layout_segment *), + struct list_head *free_me) +{ + struct pnfs_layout_segment *lp, *tmp; dprintk("%s:Begin\n", __func__); - list_for_each_entry(lp, &lo->plh_segs, pls_list) { - if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) + list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0) + continue; + if (do_merge(lseg, lp)) { + mark_lseg_invalid(lp, free_me); + continue; + } + if (is_after(&lseg->pls_range, &lp->pls_range)) continue; list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " @@ -1236,6 +1253,24 @@ out: dprintk("%s:Return\n", __func__); } +EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg); + +static void +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + struct inode *inode = lo->plh_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld->add_lseg != NULL) + ld->add_lseg(lo, lseg, free_me); + else + pnfs_generic_layout_insert_lseg(lo, lseg, + pnfs_lseg_range_is_after, + pnfs_lseg_no_merge, + free_me); +} static struct pnfs_layout_hdr * alloc_init_layout_hdr(struct inode *ino, @@ -1328,8 +1363,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, ret = pnfs_get_lseg(lseg); break; } - if (lseg->pls_range.offset > range->offset) - break; } dprintk("%s:Return lseg %p ref %d\n", @@ -1422,6 +1455,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) { + if (!pnfs_should_retry_layoutget(lo)) + return false; /* * send layoutcommit as it can hold up layoutreturn due to lseg * reference @@ -1468,6 +1503,9 @@ pnfs_update_layout(struct inode *ino, if (!pnfs_enabled_sb(NFS_SERVER(ino))) goto out; + if (iomode == IOMODE_READ && i_size_read(ino) == 0) + goto out; + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) goto out; @@ -1517,8 +1555,7 @@ lookup_again: * Because we free lsegs before sending LAYOUTRETURN, we need to wait * for LAYOUTRETURN even if first is true. */ - if (!lseg && pnfs_should_retry_layoutget(lo) && - test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { spin_unlock(&ino->i_lock); dprintk("%s wait for layoutreturn\n", __func__); if (pnfs_prepare_to_retry_layoutget(lo)) { @@ -1531,7 +1568,7 @@ lookup_again: goto out_put_layout_hdr; } - if (pnfs_layoutgets_blocked(lo, &arg, 0)) + if (pnfs_layoutgets_blocked(lo)) goto out_unlock; atomic_inc(&lo->plh_outstanding); spin_unlock(&ino->i_lock); @@ -1577,6 +1614,26 @@ out_unlock: } EXPORT_SYMBOL_GPL(pnfs_update_layout); +static bool +pnfs_sanity_check_layout_range(struct pnfs_layout_range *range) +{ + switch (range->iomode) { + case IOMODE_READ: + case IOMODE_RW: + break; + default: + return false; + } + if (range->offset == NFS4_MAX_UINT64) + return false; + if (range->length == 0) + return false; + if (range->length != NFS4_MAX_UINT64 && + range->length > NFS4_MAX_UINT64 - range->offset) + return false; + return true; +} + struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { @@ -1585,7 +1642,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; LIST_HEAD(free_me); - int status = 0; + int status = -EINVAL; + + if (!pnfs_sanity_check_layout_range(&res->range)) + goto out; /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); @@ -1603,12 +1663,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) lseg->pls_range = res->range; spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - dprintk("%s forget reply due to recall\n", __func__); - goto out_forget_reply; - } - - if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) { + if (pnfs_layoutgets_blocked(lo)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } @@ -1635,12 +1690,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); pnfs_get_lseg(lseg); - pnfs_layout_insert_lseg(lo, lseg); + pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (res->return_on_close) { + if (res->return_on_close) set_bit(NFS_LSEG_ROC, &lseg->pls_flags); - set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); - } spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me); @@ -1676,6 +1729,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg->pls_range.length); set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); mark_lseg_invalid(lseg, tmp_list); + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags); } } @@ -1694,7 +1749,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_lock(&inode->i_lock); /* set failure bit so that pnfs path will be retried later */ pnfs_layout_set_fail_bit(lo, iomode); - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); if (lo->plh_return_iomode == 0) lo->plh_return_iomode = range.iomode; else if (lo->plh_return_iomode != range.iomode) @@ -1821,6 +1875,7 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, hdr->completion_ops); + set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); @@ -1865,6 +1920,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); + hdr->release(hdr); } static enum pnfs_try_status @@ -1979,6 +2035,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); + hdr->release(hdr); } /* @@ -2203,13 +2260,12 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { + put_rpccred(data->cred); spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - spin_unlock(&inode->i_lock); - put_rpccred(data->cred); - goto clear_layoutcommitting; + goto out_unlock; } } @@ -2247,3 +2303,67 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) } return thp; } + +#if IS_ENABLED(CONFIG_NFS_V4_2) +int +pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs42_layoutstat_data *data; + struct pnfs_layout_hdr *hdr; + int status = 0; + + if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats) + goto out; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS)) + goto out; + + if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags)) + goto out; + + spin_lock(&inode->i_lock); + if (!NFS_I(inode)->layout) { + spin_unlock(&inode->i_lock); + goto out; + } + hdr = NFS_I(inode)->layout; + pnfs_get_layout_hdr(hdr); + spin_unlock(&inode->i_lock); + + data = kzalloc(sizeof(*data), gfp_flags); + if (!data) { + status = -ENOMEM; + goto out_put; + } + + data->args.fh = NFS_FH(inode); + data->args.inode = inode; + nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); + status = ld->prepare_layoutstats(&data->args); + if (status) + goto out_free; + + status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data); + +out: + dprintk("%s returns %d\n", __func__, status); + return status; + +out_free: + kfree(data); +out_put: + pnfs_put_layout_hdr(hdr); + smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); + smp_mb__after_atomic(); + goto out; +} +EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); +#endif + +unsigned int layoutstats_timer; +module_param(layoutstats_timer, uint, 0644); +EXPORT_SYMBOL_GPL(layoutstats_timer); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1e6308f82fc3..78c9351ff117 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -94,7 +94,6 @@ enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ - NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_RETURN, /* Return this layout ASAP */ NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ @@ -129,6 +128,9 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); void (*free_lseg) (struct pnfs_layout_segment *lseg); + void (*add_lseg) (struct pnfs_layout_hdr *layoutid, + struct pnfs_layout_segment *lseg, + struct list_head *free_me); void (*return_range) (struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range); @@ -178,19 +180,21 @@ struct pnfs_layoutdriver_type { void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); + int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data); }; struct pnfs_layout_hdr { atomic_t plh_refcount; + atomic_t plh_outstanding; /* number of RPCs out */ struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ - nfs4_stateid plh_stateid; - atomic_t plh_outstanding; /* number of RPCs out */ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ - u32 plh_barrier; /* ignore lower seqids */ unsigned long plh_retry_timestamp; unsigned long plh_flags; + nfs4_stateid plh_stateid; + u32 plh_barrier; /* ignore lower seqids */ enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ @@ -265,7 +269,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); +void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier); void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); @@ -284,13 +288,20 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, gfp_t gfp_flags); void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); +void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + bool (*is_after)(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *old), + bool (*do_merge)(struct pnfs_layout_segment *lseg, + struct pnfs_layout_segment *old), + struct list_head *free_me); + void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg); - /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ @@ -528,12 +539,31 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, nfss->pnfs_curr_ld->id == src->l_type); } +static inline u64 +pnfs_calc_offset_end(u64 offset, u64 len) +{ + if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset) + return NFS4_MAX_UINT64; + return offset + len - 1; +} + +static inline u64 +pnfs_calc_offset_length(u64 offset, u64 end) +{ + if (end == NFS4_MAX_UINT64 || end <= offset) + return NFS4_MAX_UINT64; + return 1 + end - offset; +} + +extern unsigned int layoutstats_timer; + #ifdef NFS_DEBUG void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); #else static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) { } + #endif /* NFS_DEBUG */ #else /* CONFIG_NFS_V4_1 */ @@ -604,10 +634,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier) { } -static inline bool -pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +static inline void +pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { - return false; } static inline void set_pnfs_layoutdriver(struct nfs_server *s, @@ -689,4 +718,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) #endif /* CONFIG_NFS_V4_1 */ +#if IS_ENABLED(CONFIG_NFS_V4_2) +int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags); +#else +static inline int +pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) +{ + return 0; +} +#endif + #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index f37e25b6311c..24655b807d44 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; - bucket->clseg = bucket->wlseg; - if (list_empty(src)) + if (bucket->clseg == NULL) + bucket->clseg = pnfs_get_lseg(bucket->wlseg); + if (list_empty(src)) { + pnfs_put_lseg_locked(bucket->wlseg); bucket->wlseg = NULL; - else - pnfs_get_lseg(bucket->clseg); + } } return ret; } @@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; + LIST_HEAD(pages); int i; + spin_lock(cinfo->lock); for (i = idx; i < fl_cinfo->nbuckets; i++) { bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) continue; - nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i); - spin_lock(cinfo->lock); freeme = bucket->clseg; bucket->clseg = NULL; + list_splice_init(&bucket->committing, &pages); spin_unlock(cinfo->lock); + nfs_retry_commit(&pages, freeme, cinfo, i); pnfs_put_lseg(freeme); + spin_lock(cinfo->lock); } + spin_unlock(cinfo->lock); } static unsigned int @@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, if (!data) break; data->ds_commit_index = i; - spin_lock(cinfo->lock); - data->lseg = bucket->clseg; - bucket->clseg = NULL; - spin_unlock(cinfo->lock); list_add(&data->pages, list); nreq++; } @@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, return nreq; } +static inline +void pnfs_fetch_commit_bucket_list(struct list_head *pages, + struct nfs_commit_data *data, + struct nfs_commit_info *cinfo) +{ + struct pnfs_commit_bucket *bucket; + + bucket = &cinfo->ds->buckets[data->ds_commit_index]; + spin_lock(cinfo->lock); + list_splice_init(&bucket->committing, pages); + data->lseg = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + +} + /* This follows nfs_commit_list pretty closely */ int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, @@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, if (!list_empty(mds_pages)) { data = nfs_commitdata_alloc(); if (data != NULL) { - data->lseg = NULL; + data->ds_commit_index = -1; list_add(&data->pages, &list); nreq++; } else { @@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); - if (!data->lseg) { + if (data->ds_commit_index < 0) { nfs_init_commit(data, mds_pages, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), data->mds_ops, how, 0); } else { - struct pnfs_commit_bucket *buckets; + LIST_HEAD(pages); - buckets = cinfo->ds->buckets; - nfs_init_commit(data, - &buckets[data->ds_commit_index].committing, - data->lseg, - cinfo); + pnfs_fetch_commit_bucket_list(&pages, data, cinfo); + nfs_init_commit(data, &pages, data->lseg, cinfo); initiate_commit(data, how); } } @@ -359,26 +373,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) return false; } +/* + * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does, + * declare a match. + */ static bool _same_data_server_addrs_locked(const struct list_head *dsaddrs1, const struct list_head *dsaddrs2) { struct nfs4_pnfs_ds_addr *da1, *da2; - - /* step through both lists, comparing as we go */ - for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), - da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); - da1 != NULL && da2 != NULL; - da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), - da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { - if (!same_sockaddr((struct sockaddr *)&da1->da_addr, - (struct sockaddr *)&da2->da_addr)) - return false; + struct sockaddr *sa1, *sa2; + bool match = false; + + list_for_each_entry(da1, dsaddrs1, da_node) { + sa1 = (struct sockaddr *)&da1->da_addr; + match = false; + list_for_each_entry(da2, dsaddrs2, da_node) { + sa2 = (struct sockaddr *)&da2->da_addr; + match = same_sockaddr(sa1, sa2); + if (match) + break; + } + if (!match) + break; } - if (da1 == NULL && da2 == NULL) - return true; - - return false; + return match; } /* @@ -863,9 +882,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; - spin_unlock(cinfo->lock); - nfs_request_add_commit_list(req, list, cinfo); + nfs_request_add_commit_list_locked(req, list, cinfo); + spin_unlock(cinfo->lock); + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f175b833b6ba..383a027de452 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -381,9 +381,12 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - register_shrinker(&acl_shrinker); + ret = register_shrinker(&acl_shrinker); + if (ret < 0) + goto error_3; return 0; - +error_3: + nfs_unregister_sysctl(); error_2: unregister_nfs4_fs(); error_1: @@ -2847,7 +2850,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) *((unsigned int *)kp->arg) = num; return 0; } -static struct kernel_param_ops param_ops_portnr = { +static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e6c262555e08..388f48079c43 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -768,6 +768,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, } /** + * nfs_request_add_commit_list_locked - add request to a commit list + * @req: pointer to a struct nfs_page + * @dst: commit list head + * @cinfo: holds list lock and accounting info + * + * This sets the PG_CLEAN bit, updates the cinfo count of + * number of outstanding requests requiring a commit as well as + * the MM page stats. + * + * The caller must hold the cinfo->lock, and the nfs_page lock. + */ +void +nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + set_bit(PG_CLEAN, &req->wb_flags); + nfs_list_add_request(req, dst); + cinfo->mds->ncommit++; +} +EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); + +/** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page * @dst: commit list head @@ -784,13 +806,10 @@ void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo) { - set_bit(PG_CLEAN, &(req)->wb_flags); spin_lock(cinfo->lock); - nfs_list_add_request(req, dst); - cinfo->mds->ncommit++; + nfs_request_add_commit_list_locked(req, dst, cinfo); spin_unlock(cinfo->lock); - if (!cinfo->dreq) - nfs_mark_page_unstable(req->wb_page); + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -1290,6 +1309,7 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); @@ -1348,11 +1368,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -static void nfs_writeback_release_common(struct nfs_pgio_header *hdr) -{ - /* do nothing! */ -} - /* * Special version of should_remove_suid() that ignores capabilities. */ @@ -1383,24 +1398,27 @@ static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, { struct nfs_pgio_args *argp = &hdr->args; struct nfs_pgio_res *resp = &hdr->res; + u64 size = argp->offset + resp->count; if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + fattr->size = size; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) { + fattr->valid &= ~NFS_ATTR_FATTR_SIZE; return; - if (argp->offset + resp->count != fattr->size) - return; - if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + } + if (size != fattr->size) return; /* Set attribute barrier */ nfs_fattr_set_barrier(fattr); + /* ...and update size */ + fattr->valid |= NFS_ATTR_FATTR_SIZE; } void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) { - struct nfs_fattr *fattr = hdr->res.fattr; + struct nfs_fattr *fattr = &hdr->fattr; struct inode *inode = hdr->inode; - if (fattr == NULL) - return; spin_lock(&inode->i_lock); nfs_writeback_check_extend(hdr, fattr); nfs_post_op_update_inode_force_wcc_locked(inode, fattr); @@ -1556,7 +1574,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, /* Set up the initial task struct. */ nfs_ops->commit_setup(data, &msg); - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + dprintk("NFS: initiated commit call\n"); nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); @@ -1794,7 +1812,7 @@ out_mark_dirty: return res; } -static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct nfs_inode *nfsi = NFS_I(inode); int flags = FLUSH_SYNC; @@ -1829,11 +1847,6 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } - -int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - return nfs_commit_unstable_pages(inode, wbc); -} EXPORT_SYMBOL_GPL(nfs_write_inode); /* @@ -2013,7 +2026,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = { .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, - .rw_release = nfs_writeback_release_common, .rw_done = nfs_writeback_done, .rw_result = nfs_writeback_result, .rw_initiate = nfs_initiate_write, diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index ae6e58ea4de5..fd8c9a5bcac4 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -63,14 +63,33 @@ EXPORT_SYMBOL_GPL(locks_end_grace); * lock reclaims. */ int -locks_in_grace(struct net *net) +__state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); + struct lock_manager *lm; - return !list_empty(grace_list); + if (!open) + return !list_empty(grace_list); + + list_for_each_entry(lm, grace_list, list) { + if (lm->block_opens) + return true; + } + return false; +} + +int locks_in_grace(struct net *net) +{ + return __state_in_grace(net, 0); } EXPORT_SYMBOL_GPL(locks_in_grace); +int opens_in_grace(struct net *net) +{ + return __state_in_grace(net, 1); +} +EXPORT_SYMBOL_GPL(opens_in_grace); + static int __net_init grace_init_net(struct net *net) { diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 9aa2796da90d..6d834dc9bbc8 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, } nr_iomaps = be32_to_cpup(p++); - expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE; + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; if (len != expected) { dprintk("%s: extent array size mismatch: %u/%u\n", __func__, len, expected); diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index fdc79037c0e7..6de925fe8499 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -7,13 +7,6 @@ struct iomap; struct xdr_stream; -enum pnfs_block_extent_state { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, - PNFS_BLOCK_NONE_DATA = 3, -}; - struct pnfs_block_extent { struct nfsd4_deviceid vol_id; u64 foff; @@ -21,14 +14,6 @@ struct pnfs_block_extent { u64 soff; enum pnfs_block_extent_state es; }; -#define NFS4_BLOCK_EXTENT_SIZE 44 - -enum pnfs_block_volume_type { - PNFS_BLOCK_VOLUME_SIMPLE = 0, - PNFS_BLOCK_VOLUME_SLICE = 1, - PNFS_BLOCK_VOLUME_CONCAT = 2, - PNFS_BLOCK_VOLUME_STRIPE = 3, -}; /* * Random upper cap for the uuid length to avoid unbounded allocation. diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index f79521a59747..b4d84b579f20 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1075,73 +1075,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) return rv; } -/* Iterator */ - -static void *e_start(struct seq_file *m, loff_t *pos) - __acquires(((struct cache_detail *)m->private)->hash_lock) -{ - loff_t n = *pos; - unsigned hash, export; - struct cache_head *ch; - struct cache_detail *cd = m->private; - struct cache_head **export_table = cd->hash_table; - - read_lock(&cd->hash_lock); - if (!n--) - return SEQ_START_TOKEN; - hash = n >> 32; - export = n & ((1LL<<32) - 1); - - - for (ch=export_table[hash]; ch; ch=ch->next) - if (!export--) - return ch; - n &= ~((1LL<<32) - 1); - do { - hash++; - n += 1LL<<32; - } while(hash < EXPORT_HASHMAX && export_table[hash]==NULL); - if (hash >= EXPORT_HASHMAX) - return NULL; - *pos = n+1; - return export_table[hash]; -} - -static void *e_next(struct seq_file *m, void *p, loff_t *pos) -{ - struct cache_head *ch = p; - int hash = (*pos >> 32); - struct cache_detail *cd = m->private; - struct cache_head **export_table = cd->hash_table; - - if (p == SEQ_START_TOKEN) - hash = 0; - else if (ch->next == NULL) { - hash++; - *pos += 1LL<<32; - } else { - ++*pos; - return ch->next; - } - *pos &= ~((1LL<<32) - 1); - while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) { - hash++; - *pos += 1LL<<32; - } - if (hash >= EXPORT_HASHMAX) - return NULL; - ++*pos; - return export_table[hash]; -} - -static void e_stop(struct seq_file *m, void *p) - __releases(((struct cache_detail *)m->private)->hash_lock) -{ - struct cache_detail *cd = m->private; - - read_unlock(&cd->hash_lock); -} - static struct flags { int flag; char *name[2]; @@ -1270,9 +1203,9 @@ static int e_show(struct seq_file *m, void *p) } const struct seq_operations nfs_exports_op = { - .start = e_start, - .next = e_next, - .stop = e_stop, + .start = cache_seq_start, + .next = cache_seq_next, + .stop = cache_seq_stop, .show = e_show, }; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 1f52bfcc436f..2e315072bf3f 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -6,6 +6,7 @@ #include <linux/sunrpc/cache.h> #include <uapi/linux/nfsd/export.h> +#include <linux/nfs4.h> struct knfsd_fh; struct svc_fh; diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index a3f34900091f..23cc85d1efdd 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -37,9 +37,7 @@ #include <linux/in.h> #include <linux/sunrpc/svc.h> - -/* XXX from linux/nfs_idmap.h */ -#define IDMAP_NAMESZ 128 +#include <linux/nfs_idmap.h> #ifdef CONFIG_NFSD_V4 int nfsd_idmap_init(struct net *); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ea6749a32760..d8b16c2568f3 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -110,6 +110,7 @@ struct nfsd_net { unsigned int max_connections; u32 clientid_counter; + u32 clverifier_counter; struct svc_serv *nfsd_serv; }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index d54701f6dc78..1580ea6fd64d 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -44,13 +44,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, inode = d_inode(fh->fh_dentry); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + if (argp->mask & ~NFS_ACL_MASK) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; nfserr = fh_getattr(fh, &resp->stat); if (nfserr) - goto fail; + RETURN_STATUS(nfserr); if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { acl = get_acl(inode, ACL_TYPE_ACCESS); @@ -202,7 +202,7 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; argp->mask = ntohl(*p++); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + if (argp->mask & ~NFS_ACL_MASK || !xdr_argsize_check(rqstp, p)) return 0; @@ -293,9 +293,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, resp->acl_default, resp->mask & NFS_DFACL, NFS_ACL_DEFAULT); - if (n <= 0) - return 0; - return 1; + return (n > 0); } static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 882b1a14bc3e..01df4cd7c753 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -41,7 +41,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, inode = d_inode(fh->fh_dentry); - if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + if (argp->mask & ~NFS_ACL_MASK) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; @@ -148,7 +148,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; args->mask = ntohl(*p++); - if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + if (args->mask & ~NFS_ACL_MASK || !xdr_argsize_check(rqstp, p)) return 0; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index eb5accf1b37f..6adabd6049b7 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -34,8 +34,10 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/fs.h> #include <linux/slab.h> -#include <linux/nfs_fs.h> +#include <linux/posix_acl.h> + #include "nfsfh.h" #include "nfsd.h" #include "acl.h" @@ -100,7 +102,7 @@ deny_mask_from_posix(unsigned short perm, u32 flags) /* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the * side of being more restrictive, so the mode bit mapping below is * pessimistic. An optimistic version would be needed to handle DENY's, - * but we espect to coalesce all ALLOWs and DENYs before mapping to mode + * but we expect to coalesce all ALLOWs and DENYs before mapping to mode * bits. */ static void @@ -458,7 +460,7 @@ init_state(struct posix_acl_state *state, int cnt) state->empty = 1; /* * In the worst case, each individual acl could be for a distinct - * named user or group, but we don't no which, so we allocate + * named user or group, but we don't know which, so we allocate * enough space for either: */ alloc = sizeof(struct posix_ace_state_array) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a49201835a97..e7f50c4081d6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -435,12 +435,12 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, */ status = 0; out: - if (status) - nfsd4_mark_cb_fault(cb->cb_clp, status); + cb->cb_seq_status = status; return status; out_overflow: print_overflow_msg(__func__, xdr); - return -EIO; + status = -EIO; + goto out; } static int decode_cb_sequence4res(struct xdr_stream *xdr, @@ -451,11 +451,10 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, if (cb->cb_minorversion == 0) return 0; - status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status); - if (unlikely(status || cb->cb_status)) + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status); + if (unlikely(status || cb->cb_seq_status)) return status; - cb->cb_update_seq_nr = true; return decode_cb_sequence4resok(xdr, cb); } @@ -527,7 +526,7 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (cb != NULL) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_status)) + if (unlikely(status || cb->cb_seq_status)) return status; } @@ -617,7 +616,7 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, if (cb) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_status)) + if (unlikely(status || cb->cb_seq_status)) return status; } return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); @@ -876,7 +875,11 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; - cb->cb_update_seq_nr = false; + /* + * cb_seq_status is only set in decode_cb_sequence4res, + * and so will remain 1 if an rpc level failure occurs. + */ + cb->cb_seq_status = 1; cb->cb_status = 0; if (minorversion) { if (!nfsd41_cb_get_slot(clp, task)) @@ -885,15 +888,30 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) rpc_call_start(task); } -static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb) { - struct nfsd4_callback *cb = calldata; struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *session = clp->cl_cb_session; + bool ret = true; - dprintk("%s: minorversion=%d\n", __func__, - clp->cl_minorversion); + if (!clp->cl_minorversion) { + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. + */ + if (task->tk_flags & RPC_TASK_KILLED) + goto need_restart; + + return true; + } - if (clp->cl_minorversion) { + switch (cb->cb_seq_status) { + case 0: /* * No need for lock, access serialized in nfsd4_cb_prepare * @@ -901,29 +919,63 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) * If CB_SEQUENCE returns an error, then the state of the slot * (sequence ID, cached reply) MUST NOT change. */ - if (cb->cb_update_seq_nr) - ++clp->cl_cb_session->se_cb_seq_nr; - - clear_bit(0, &clp->cl_cb_slot_busy); - rpc_wake_up_next(&clp->cl_cb_waitq); - dprintk("%s: freed slot, new seqid=%d\n", __func__, - clp->cl_cb_session->se_cb_seq_nr); + ++session->se_cb_seq_nr; + break; + case -ESERVERFAULT: + ++session->se_cb_seq_nr; + case 1: + case -NFS4ERR_BADSESSION: + nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + ret = false; + break; + case -NFS4ERR_DELAY: + if (!rpc_restart_call(task)) + goto out; + + rpc_delay(task, 2 * HZ); + return false; + case -NFS4ERR_BADSLOT: + goto retry_nowait; + case -NFS4ERR_SEQ_MISORDERED: + if (session->se_cb_seq_nr != 1) { + session->se_cb_seq_nr = 1; + goto retry_nowait; + } + break; + default: + dprintk("%s: unprocessed error %d\n", __func__, + cb->cb_seq_status); } - /* - * If the backchannel connection was shut down while this - * task was queued, we need to resubmit it after setting up - * a new backchannel connection. - * - * Note that if we lost our callback connection permanently - * the submission code will error out, so we don't need to - * handle that case here. - */ - if (task->tk_flags & RPC_TASK_KILLED) { - task->tk_status = 0; - cb->cb_need_restart = true; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_session->se_cb_seq_nr); + + if (task->tk_flags & RPC_TASK_KILLED) + goto need_restart; +out: + return ret; +retry_nowait: + if (rpc_restart_call_prepare(task)) + ret = false; + goto out; +need_restart: + task->tk_status = 0; + cb->cb_need_restart = true; + return false; +} + +static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; + + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_minorversion); + + if (!nfsd4_cb_sequence_done(task, cb)) return; - } if (cb->cb_status) { WARN_ON_ONCE(task->tk_status); @@ -1099,8 +1151,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); + cb->cb_seq_status = 1; cb->cb_status = 0; - cb->cb_update_seq_nr = false; cb->cb_need_restart = false; } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index e1b3d3d472da..5b20577dcdd2 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -59,9 +59,6 @@ MODULE_PARM_DESC(nfs4_disable_idmapping, * that. */ -#define IDMAP_TYPE_USER 0 -#define IDMAP_TYPE_GROUP 1 - struct ent { struct cache_head h; int type; /* User / Group */ diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 6904213a4363..ebf90e487c75 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -212,6 +212,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { + fput(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 90cfda75313c..4ce6b97b31ad 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -276,13 +276,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); /* - * Following rfc 3530 14.2.16, use the returned bitmask - * to indicate which attributes we used to store the - * verifier: + * Following rfc 3530 14.2.16, and rfc 5661 18.16.4 + * use the returned bitmask to indicate which attributes + * we used to store the verifier: */ - if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) - open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | - FATTR4_WORD1_TIME_MODIFY); + if (nfsd_create_is_exclusive(open->op_createmode) && status == 0) + open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_MODIFY); } else /* * Note this may exit with the parent still locked. @@ -362,7 +362,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; struct svc_fh *resfh = NULL; - struct nfsd4_compoundres *resp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -389,8 +388,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, copy_clientid(&open->op_clientid, cstate->session); /* check seqid for replay. set nfs4_owner */ - resp = rqstp->rq_resp; - status = nfsd4_process_open1(&resp->cstate, open, nn); + status = nfsd4_process_open1(cstate, open, nn); if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; fh_put(&cstate->current_fh); @@ -417,10 +415,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (opens_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!opens_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -829,7 +827,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (locks_in_grace(SVC_NET(rqstp))) + if (opens_in_grace(SVC_NET(rqstp))) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -848,7 +846,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (locks_in_grace(SVC_NET(rqstp)) && + if (opens_in_grace(SVC_NET(rqstp)) && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, @@ -1364,10 +1362,6 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, goto out; } - nfserr = ops->proc_layoutcommit(inode, lcp); - if (nfserr) - goto out_put_stid; - if (new_size > i_size_read(inode)) { lcp->lc_size_chg = 1; lcp->lc_newsize = new_size; @@ -1375,7 +1369,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, lcp->lc_size_chg = 0; } -out_put_stid: + nfserr = ops->proc_layoutcommit(inode, lcp); nfs4_put_stid(&ls->ls_stid); out: return nfserr; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index d88ea7b9a85c..e3d47091b191 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -272,6 +272,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) .ctx.actor = nfsd4_build_namelist, .names = LIST_HEAD_INIT(ctx.names) }; + struct name_list *entry, *tmp; int status; status = nfs4_save_creds(&original_cred); @@ -286,9 +287,8 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) status = iterate_dir(nn->rec_file, &ctx.ctx); mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); - while (!list_empty(&ctx.names)) { - struct name_list *entry; - entry = list_entry(ctx.names.next, struct name_list, list); + + list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { struct dentry *dentry; dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); @@ -304,6 +304,12 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } mutex_unlock(&d_inode(dir)->i_mutex); nfs4_reset_creds(original_cred); + + list_for_each_entry_safe(entry, tmp, &ctx.names, list) { + dprintk("NFSD: %s. Left entry %s\n", __func__, entry->name); + list_del(&entry->list); + kfree(entry); + } return status; } @@ -541,8 +547,7 @@ nfsd4_legacy_tracking_init(struct net *net) /* XXX: The legacy code won't work in a container */ if (net != &init_net) { - WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client " - "tracking in a container!\n"); + pr_warn("NFSD: attempt to initialize legacy client tracking in a container ignored.\n"); return -EINVAL; } @@ -1254,8 +1259,7 @@ nfsd4_umh_cltrack_init(struct net *net) /* XXX: The usermode helper s not working in container yet. */ if (net != &init_net) { - WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " - "tracking in a container!\n"); + pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n"); return -EINVAL; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 61dfb33f0559..0f1d5691b795 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -777,13 +777,16 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); } -static void +static bool unhash_delegation_locked(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; lockdep_assert_held(&state_lock); + if (list_empty(&dp->dl_perfile)) + return false; + dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; @@ -792,16 +795,21 @@ unhash_delegation_locked(struct nfs4_delegation *dp) list_del_init(&dp->dl_recall_lru); list_del_init(&dp->dl_perfile); spin_unlock(&fp->fi_lock); + return true; } static void destroy_delegation(struct nfs4_delegation *dp) { + bool unhashed; + spin_lock(&state_lock); - unhash_delegation_locked(dp); + unhashed = unhash_delegation_locked(dp); spin_unlock(&state_lock); - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - nfs4_put_stid(&dp->dl_stid); + if (unhashed) { + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); + nfs4_put_stid(&dp->dl_stid); + } } static void revoke_delegation(struct nfs4_delegation *dp) @@ -990,6 +998,12 @@ release_all_access(struct nfs4_ol_stateid *stp) } } +static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop) +{ + kfree(sop->so_owner.data); + sop->so_ops->so_free(sop); +} + static void nfs4_put_stateowner(struct nfs4_stateowner *sop) { struct nfs4_client *clp = sop->so_client; @@ -1000,20 +1014,23 @@ static void nfs4_put_stateowner(struct nfs4_stateowner *sop) return; sop->so_ops->so_unhash(sop); spin_unlock(&clp->cl_lock); - kfree(sop->so_owner.data); - sop->so_ops->so_free(sop); + nfs4_free_stateowner(sop); } -static void unhash_ol_stateid(struct nfs4_ol_stateid *stp) +static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_file *fp = stp->st_stid.sc_file; lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock); + if (list_empty(&stp->st_perfile)) + return false; + spin_lock(&fp->fi_lock); - list_del(&stp->st_perfile); + list_del_init(&stp->st_perfile); spin_unlock(&fp->fi_lock); list_del(&stp->st_perstateowner); + return true; } static void nfs4_free_ol_stateid(struct nfs4_stid *stid) @@ -1063,25 +1080,27 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, list_add(&stp->st_locks, reaplist); } -static void unhash_lock_stateid(struct nfs4_ol_stateid *stp) +static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); list_del_init(&stp->st_locks); - unhash_ol_stateid(stp); nfs4_unhash_stid(&stp->st_stid); + return unhash_ol_stateid(stp); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); + bool unhashed; spin_lock(&oo->oo_owner.so_client->cl_lock); - unhash_lock_stateid(stp); + unhashed = unhash_lock_stateid(stp); spin_unlock(&oo->oo_owner.so_client->cl_lock); - nfs4_put_stid(&stp->st_stid); + if (unhashed) + nfs4_put_stid(&stp->st_stid); } static void unhash_lockowner_locked(struct nfs4_lockowner *lo) @@ -1129,7 +1148,7 @@ static void release_lockowner(struct nfs4_lockowner *lo) while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - unhash_lock_stateid(stp); + WARN_ON(!unhash_lock_stateid(stp)); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); @@ -1142,21 +1161,26 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, { struct nfs4_ol_stateid *stp; + lockdep_assert_held(&open_stp->st_stid.sc_client->cl_lock); + while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); - unhash_lock_stateid(stp); + WARN_ON(!unhash_lock_stateid(stp)); put_ol_stateid_locked(stp, reaplist); } } -static void unhash_open_stateid(struct nfs4_ol_stateid *stp, +static bool unhash_open_stateid(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { + bool unhashed; + lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); - unhash_ol_stateid(stp); + unhashed = unhash_ol_stateid(stp); release_open_stateid_locks(stp, reaplist); + return unhashed; } static void release_open_stateid(struct nfs4_ol_stateid *stp) @@ -1164,8 +1188,8 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp) LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); - unhash_open_stateid(stp, &reaplist); - put_ol_stateid_locked(stp, &reaplist); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); free_ol_stateid_reaplist(&reaplist); } @@ -1210,8 +1234,8 @@ static void release_openowner(struct nfs4_openowner *oo) while (!list_empty(&oo->oo_owner.so_stateids)) { stp = list_first_entry(&oo->oo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - unhash_open_stateid(stp, &reaplist); - put_ol_stateid_locked(stp, &reaplist); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); @@ -1714,7 +1738,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - unhash_delegation_locked(dp); + WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -1894,7 +1918,7 @@ static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn) * __force to keep sparse happy */ verf[0] = (__force __be32)get_seconds(); - verf[1] = (__force __be32)nn->clientid_counter; + verf[1] = (__force __be32)nn->clverifier_counter++; memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); } @@ -2241,6 +2265,9 @@ static bool client_has_state(struct nfs4_client *clp) * Also note we should probably be using this in 4.0 case too. */ return !list_empty(&clp->cl_openowners) +#ifdef CONFIG_NFSD_PNFS + || !list_empty(&clp->cl_lo_states) +#endif || !list_empty(&clp->cl_delegations) || !list_empty(&clp->cl_sessions); } @@ -2547,11 +2574,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; cs_slot = &conf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status == nfserr_replay_cache) { - status = nfsd4_replay_create_session(cr_ses, cs_slot); - goto out_free_conn; - } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { - status = nfserr_seq_misordered; + if (status) { + if (status == nfserr_replay_cache) + status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out_free_conn; } } else if (unconf) { @@ -3041,10 +3066,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unconf = find_unconfirmed_client_by_name(&clname, nn); if (unconf) unhash_client_locked(unconf); - if (conf && same_verf(&conf->cl_verifier, &clverifier)) + if (conf && same_verf(&conf->cl_verifier, &clverifier)) { /* case 1: probable callback update */ copy_clid(new, conf); - else /* case 4 (new client) or cases 2, 3 (client reboot): */ + gen_confirm(new, nn); + } else /* case 4 (new client) or cases 2, 3 (client reboot): */ gen_clid(new, nn); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); @@ -3085,10 +3111,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, /* * We try hard to give out unique clientid's, so if we get an * attempt to confirm the same clientid with a different cred, - * there's a bug somewhere. Let's charitably assume it's our - * bug. + * the client may be buggy; this should never happen. + * + * Nevertheless, RFC 7530 recommends INUSE for this case: */ - status = nfserr_serverfault; + status = nfserr_clid_inuse; if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) goto out; if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) @@ -3315,7 +3342,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, hash_openowner(oo, clp, strhashval); ret = oo; } else - nfs4_free_openowner(&oo->oo_owner); + nfs4_free_stateowner(&oo->oo_owner); + spin_unlock(&clp->cl_lock); return ret; } @@ -3482,6 +3510,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, { struct nfs4_delegation *dp = cb_to_delegation(cb); + if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID) + return 1; + switch (task->tk_status) { case 0: return 1; @@ -3885,12 +3916,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c return status; } -static void -nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session) -{ - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; -} - /* Should we give out recallable state?: */ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) { @@ -3923,7 +3948,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) static int nfs4_setlease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file_lock *fl, *ret; + struct file_lock *fl; struct file *filp; int status = 0; @@ -3934,10 +3959,10 @@ static int nfs4_setlease(struct nfs4_delegation *dp) if (!filp) { /* We should always have a readable file here */ WARN_ON_ONCE(1); + locks_free_lock(fl); return -EBADF; } fl->fl_file = filp; - ret = fl; status = vfs_setlease(filp, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); @@ -4063,7 +4088,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any delegations till everyone's - * had the chance to reclaim theirs.... + * had the chance to reclaim theirs, *and* until + * NLM locks have all been reclaimed: */ if (locks_in_grace(clp->net)) goto out_no_deleg; @@ -4209,7 +4235,7 @@ out: if (fp) put_nfs4_file(fp); if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) - nfs4_set_claim_prev(open, nfsd4_has_session(&resp->cstate)); + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; /* * To finish the open response, we just need to set the rflags. */ @@ -4338,14 +4364,12 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) - continue; if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { t = dp->dl_time - cutoff; new_timeo = min(new_timeo, t); break; } - unhash_delegation_locked(dp); + WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -4396,9 +4420,9 @@ laundromat_main(struct work_struct *laundry) queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } -static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { - if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) + if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } @@ -4440,7 +4464,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, { if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (locks_in_grace(net)) { + else if (opens_in_grace(net)) { /* Answer in remaining cases depends on existence of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; @@ -4459,7 +4483,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, static inline int grace_disallows_io(struct net *net, struct inode *inode) { - return locks_in_grace(net) && mandatory_lock(inode); + return opens_in_grace(net) && mandatory_lock(inode); } /* Returns true iff a is later than b: */ @@ -4601,9 +4625,6 @@ nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags) { __be32 status; - status = nfs4_check_fh(fhp, ols); - if (status) - return status; status = nfsd4_check_openowner_confirmed(ols); if (status) return status; @@ -4690,6 +4711,9 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, status = nfserr_bad_stateid; break; } + if (status) + goto out; + status = nfs4_check_fh(fhp, s); done: if (!status && filpp) @@ -4751,7 +4775,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_for_locks(stp->st_stid.sc_file, lockowner(stp->st_stateowner))) break; - unhash_lock_stateid(stp); + WARN_ON(!unhash_lock_stateid(stp)); spin_unlock(&cl->cl_lock); nfs4_put_stid(s); ret = nfs_ok; @@ -4798,7 +4822,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) return status; - return nfs4_check_fh(current_fh, stp); + return nfs4_check_fh(current_fh, &stp->st_stid); } /* @@ -4967,20 +4991,23 @@ out: static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { struct nfs4_client *clp = s->st_stid.sc_client; + bool unhashed; LIST_HEAD(reaplist); s->st_stid.sc_type = NFS4_CLOSED_STID; spin_lock(&clp->cl_lock); - unhash_open_stateid(s, &reaplist); + unhashed = unhash_open_stateid(s, &reaplist); if (clp->cl_minorversion) { - put_ol_stateid_locked(s, &reaplist); + if (unhashed) + put_ol_stateid_locked(s, &reaplist); spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); } else { spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); - move_to_close_lru(s, clp->net); + if (unhashed) + move_to_close_lru(s, clp->net); } } @@ -5045,9 +5072,6 @@ out: return status; } - -#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) - static inline u64 end_offset(u64 start, u64 len) { @@ -5139,8 +5163,7 @@ nevermind: } static struct nfs4_lockowner * -find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner, - struct nfs4_client *clp) +find_lockowner_str_locked(struct nfs4_client *clp, struct xdr_netobj *owner) { unsigned int strhashval = ownerstr_hashval(owner); struct nfs4_stateowner *so; @@ -5158,13 +5181,12 @@ find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner, } static struct nfs4_lockowner * -find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner, - struct nfs4_client *clp) +find_lockowner_str(struct nfs4_client *clp, struct xdr_netobj *owner) { struct nfs4_lockowner *lo; spin_lock(&clp->cl_lock); - lo = find_lockowner_str_locked(clid, owner, clp); + lo = find_lockowner_str_locked(clp, owner); spin_unlock(&clp->cl_lock); return lo; } @@ -5208,14 +5230,14 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; lo->lo_owner.so_ops = &lockowner_ops; spin_lock(&clp->cl_lock); - ret = find_lockowner_str_locked(&clp->cl_clientid, - &lock->lk_new_owner, clp); + ret = find_lockowner_str_locked(clp, &lock->lk_new_owner); if (ret == NULL) { list_add(&lo->lo_owner.so_strhash, &clp->cl_ownerstr_hashtbl[strhashval]); ret = lo; } else - nfs4_free_lockowner(&lo->lo_owner); + nfs4_free_stateowner(&lo->lo_owner); + spin_unlock(&clp->cl_lock); return ret; } @@ -5298,8 +5320,8 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, static int check_lock_length(u64 offset, u64 length) { - return ((length == 0) || ((length != NFS4_MAX_UINT64) && - LOFF_OVERFLOW(offset, length))); + return ((length == 0) || ((length != NFS4_MAX_UINT64) && + (length > ~offset))); } static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) @@ -5328,9 +5350,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_lockowner *lo; unsigned int strhashval; - lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl); + lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { - strhashval = ownerstr_hashval(&lock->v.new.owner); + strhashval = ownerstr_hashval(&lock->lk_new_owner); lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); if (lo == NULL) return nfserr_jukebox; @@ -5391,7 +5413,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ - memcpy(&lock->v.new.clientid, + memcpy(&lock->lk_new_clientid, &cstate->session->se_client->cl_clientid, sizeof(clientid_t)); @@ -5409,7 +5431,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, - &lock->v.new.clientid)) + &lock->lk_new_clientid)) goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); @@ -5603,8 +5625,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner, - cstate->clp); + lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); if (lo) file_lock->fl_owner = (fl_owner_t)lo; file_lock->fl_pid = current->tgid; @@ -6019,7 +6040,7 @@ nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst, static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, struct list_head *collect, - void (*func)(struct nfs4_ol_stateid *)) + bool (*func)(struct nfs4_ol_stateid *)) { struct nfs4_openowner *oop; struct nfs4_ol_stateid *stp, *st_next; @@ -6033,9 +6054,9 @@ static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, list_for_each_entry_safe(lst, lst_next, &stp->st_locks, st_locks) { if (func) { - func(lst); - nfsd_inject_add_lock_to_list(lst, - collect); + if (func(lst)) + nfsd_inject_add_lock_to_list(lst, + collect); } ++count; /* @@ -6305,7 +6326,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, continue; atomic_inc(&clp->cl_refcount); - unhash_delegation_locked(dp); + WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, victims); } ++count; @@ -6584,6 +6605,7 @@ nfs4_state_start_net(struct net *net) return ret; nn->boot_time = get_seconds(); nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", @@ -6602,7 +6624,7 @@ nfs4_state_start(void) ret = set_callback_cred(); if (ret) return -ENOMEM; - laundry_wq = create_singlethread_workqueue("nfsd4"); + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) { ret = -ENOMEM; goto out_recovery; @@ -6635,7 +6657,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation_locked(dp); + WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 54633858733a..51c9e9ca39a4 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2140,9 +2140,31 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, return nfsd4_encode_user(xdr, rqstp, ace->who_uid); } +static inline __be32 +nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type) +{ + __be32 *p; + + if (layout_type) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(layout_type); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); + } + + return 0; +} + #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ FATTR4_WORD0_RDATTR_ERROR) #define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID +#define WORD2_ABSENT_FS_ATTRS 0 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 @@ -2171,7 +2193,7 @@ nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, { return 0; } #endif -static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) +static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 *rdattr_err) { /* As per referral draft: */ if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || @@ -2184,6 +2206,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) } *bmval0 &= WORD0_ABSENT_FS_ATTRS; *bmval1 &= WORD1_ABSENT_FS_ATTRS; + *bmval2 &= WORD2_ABSENT_FS_ATTRS; return 0; } @@ -2203,6 +2226,39 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) return err; } +static __be32 +nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) +{ + __be32 *p; + + if (bmval2) { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + *p++ = cpu_to_be32(bmval2); + } else if (bmval1) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + } else { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(bmval0); + } + + return 0; +out_resource: + return nfserr_resource; +} + /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. @@ -2246,8 +2302,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); if (exp->ex_fslocs.migrated) { - BUG_ON(bmval[2]); - status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); + status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); if (status) goto out; } @@ -2286,8 +2341,8 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, } #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) || - bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { + if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || + bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); contextsupport = (err == 0); @@ -2300,28 +2355,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, } #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ - if (bmval2) { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(bmval0); - *p++ = cpu_to_be32(bmval1); - *p++ = cpu_to_be32(bmval2); - } else if (bmval1) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bmval0); - *p++ = cpu_to_be32(bmval1); - } else { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(bmval0); - } + status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2); + if (status) + goto out; attrlen_offset = xdr->buf->len; p = xdr_reserve_space(xdr, 4); @@ -2674,6 +2710,9 @@ out_acl: *p++ = cpu_to_be32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { + struct kstat parent_stat; + u64 ino = stat.ino; + p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; @@ -2682,25 +2721,25 @@ out_acl: * and this is the root of a cross-mounted filesystem. */ if (ignore_crossmnt == 0 && - dentry == exp->ex_path.mnt->mnt_root) - get_parent_attributes(exp, &stat); - p = xdr_encode_hyper(p, stat.ino); + dentry == exp->ex_path.mnt->mnt_root) { + err = get_parent_attributes(exp, &parent_stat); + if (err) + goto out_nfserr; + ino = parent_stat.ino; + } + p = xdr_encode_hyper(p, ino); } #ifdef CONFIG_NFSD_PNFS - if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) || - (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) { - if (exp->ex_layout_type) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(exp->ex_layout_type); - } else { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } + if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + if (status) + goto out; + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { + status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + if (status) + goto out; } if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { @@ -2710,21 +2749,20 @@ out_acl: *p++ = cpu_to_be32(stat.blksize); } #endif /* CONFIG_NFSD_PNFS */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2); + if (status) + goto out; + } + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, contextlen); if (status) goto out; } - if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0); - *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1); - *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2); - } attrlen = htonl(xdr->buf->len - attrlen_offset - 4); write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); @@ -3043,13 +3081,12 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ __be32 *p; if (!nfserr) { - p = xdr_reserve_space(xdr, 32); + p = xdr_reserve_space(xdr, 20); if (!p) return nfserr_resource; - p = encode_cinfo(p, &create->cr_cinfo); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(create->cr_bmval[0]); - *p++ = cpu_to_be32(create->cr_bmval[1]); + encode_cinfo(p, &create->cr_cinfo); + nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], + create->cr_bmval[1], create->cr_bmval[2]); } return nfserr; } @@ -3189,16 +3226,22 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); if (nfserr) goto out; - p = xdr_reserve_space(xdr, 40); + p = xdr_reserve_space(xdr, 24); if (!p) return nfserr_resource; p = encode_cinfo(p, &open->op_cinfo); *p++ = cpu_to_be32(open->op_rflags); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(open->op_bmval[0]); - *p++ = cpu_to_be32(open->op_bmval[1]); - *p++ = cpu_to_be32(open->op_delegate_type); + nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], + open->op_bmval[2]); + if (nfserr) + goto out; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(open->op_delegate_type); switch (open->op_delegate_type) { case NFS4_OPEN_DELEGATE_NONE: break; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9277cc91c21b..ad4e2377dd63 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -391,6 +391,14 @@ static int nfsd_get_default_max_blksize(void) return ret; } +static struct svc_serv_ops nfsd_thread_sv_ops = { + .svo_shutdown = nfsd_last_thread, + .svo_function = nfsd, + .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; + int nfsd_create_serv(struct net *net) { int error; @@ -405,7 +413,7 @@ int nfsd_create_serv(struct net *net) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(); nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - nfsd_last_thread, nfsd, THIS_MODULE); + &nfsd_thread_sv_ops); if (nn->nfsd_serv == NULL) return -ENOMEM; @@ -500,8 +508,8 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) /* apply the new numbers */ svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { - err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], - nthreads[i]); + err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, + &nn->nfsd_serv->sv_pools[i], nthreads[i]); if (err) break; } @@ -540,7 +548,8 @@ nfsd_svc(int nrservs, struct net *net) error = nfsd_startup_net(nrservs, net); if (error) goto out_destroy; - error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); + error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, + NULL, nrservs); if (error) goto out_shutdown; /* We are holding a reference to nn->nfsd_serv which diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4874ce515fc1..583ffc13cae2 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -67,8 +67,8 @@ struct nfsd4_callback { struct rpc_message cb_msg; struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; + int cb_seq_status; int cb_status; - bool cb_update_seq_nr; bool cb_need_restart; }; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b5e077a6e7d4..45c04979e7b3 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1249,12 +1249,6 @@ out_nfserr: #ifdef CONFIG_NFSD_V3 -static inline int nfsd_create_is_exclusive(int createmode) -{ - return createmode == NFS3_CREATE_EXCLUSIVE - || createmode == NFS4_CREATE_EXCLUSIVE4_1; -} - /* * NFSv3 and NFSv4 version of nfsd_create */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 5be875e3e638..fee2451ae248 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -131,4 +131,10 @@ static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) return nfserrno(vfs_getattr(&p, stat)); } +static inline int nfsd_create_is_exclusive(int createmode) +{ + return createmode == NFS3_CREATE_EXCLUSIVE + || createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 0ee0bed3649b..6b8b92b19cec 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 258d9fe2521a..4a73d6dffabf 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -307,31 +307,13 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, static ssize_t nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - ssize_t size; + struct inode *inode = file_inode(iocb->ki_filp); if (iov_iter_rw(iter) == WRITE) return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); - - /* - * In case of error extending write may have instantiated a few - * blocks outside i_size. Trim these off again. - */ - if (unlikely(iov_iter_rw(iter) == WRITE && size < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - nilfs_write_failed(mapping, end); - } - - return size; + return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); } const struct address_space_operations nilfs_aops = { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 9a20e513d7eb..aba43811d6ef 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1369,7 +1369,6 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: - case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 42468e5ab3e7..f63620ce3892 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,12 +338,11 @@ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) /* * BIO operations */ -static void nilfs_end_bio_write(struct bio *bio, int err) +static void nilfs_end_bio_write(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (!uptodate) + if (bio->bi_error) atomic_inc(&segbuf->sb_err); bio_put(bio); @@ -415,7 +414,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; - wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev); + wi->max_pages = BIO_MAX_PAGES; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; wi->blocknr = segbuf->sb_pseg_start; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 44523f4a6084..6faaf710e563 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; + bool free = false; inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) @@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id) /* nothing else could have found us thanks to the dnotify_groups mark_mutex */ - if (dn_mark->dn == NULL) - fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); + if (dn_mark->dn == NULL) { + fsnotify_detach_mark(fsn_mark); + free = true; + } mutex_unlock(&dnotify_group->mark_mutex); + if (free) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); } @@ -362,9 +367,10 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - + fsnotify_detach_mark(fsn_mark); mutex_unlock(&dnotify_group->mark_mutex); + if (destroy) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index cf275500a665..8e8e6bcd1d43 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark_locked(fsn_mark, group); + fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); + if (destroy_mark) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark_locked(fsn_mark, group); + fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); + if (destroy_mark) + fsnotify_free_mark(fsn_mark); /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 58b7cdb63da9..6b6f0d472ae8 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) || + !(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index dd3fb0b17be7..db39de2dd4cb 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -26,7 +26,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include "../mount.h" /* * Clear all of the marks on an inode when it is being evicted from core @@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, mnt = NULL; /* + * Optimization: srcu_read_lock() has a memory barrier which can + * be expensive. It protects walking the *_fsnotify_marks lists. + * However, if we do not walk the lists, we do not have to do + * SRCU because we have no references to any objects and do not + * need SRCU to keep them "alive". + */ + if (hlist_empty(&to_tell->i_fsnotify_marks) && + (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) + return 0; + /* * if this is a modify event we may need to clear the ignored masks * otherwise return if neither the inode nor the vfsmount care about * this type of event. diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 13a00be516d2..b44c68a857e7 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -6,6 +6,8 @@ #include <linux/srcu.h> #include <linux/types.h> +#include "../mount.h" + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); -/* Destroy all marks in the given list */ -extern void fsnotify_destroy_marks(struct list_head *to_free); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, struct fsnotify_group *group); -/* run the list of all marks associated with inode and flag them to be freed */ -extern void fsnotify_clear_marks_by_inode(struct inode *inode); -/* run the list of all marks associated with vfsmount and flag them to be freed */ -extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); +/* Destroy all marks in the given list protected by 'lock' */ +extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); +/* run the list of all marks associated with inode and destroy them */ +static inline void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); +} +/* run the list of all marks associated with vfsmount and destroy them */ +static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) +{ + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, + &mnt->mnt_root->d_lock); +} /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 3daf513ee99e..e785fd954c30 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) } /* - * Given an inode, destroy all of the marks associated with that inode. - */ -void fsnotify_clear_marks_by_inode(struct inode *inode) -{ - struct fsnotify_mark *mark; - struct hlist_node *n; - LIST_HEAD(free_list); - - spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { - list_add(&mark->free_list, &free_list); - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - } - spin_unlock(&inode->i_lock); - - fsnotify_destroy_marks(&free_list); -} - -/* * Given a group clear all of the inode marks associated with that group. */ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) @@ -163,17 +143,17 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @list: list of inodes being unmounted (sb->s_inodes) + * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ -void fsnotify_unmount_inodes(struct list_head *list) +void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *next_i, *need_iput = NULL; - spin_lock(&inode_sb_list_lock); - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { struct inode *need_iput_tmp; /* @@ -209,7 +189,7 @@ void fsnotify_unmount_inodes(struct list_head *list) spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ - while (&next_i->i_sb_list != list) { + while (&next_i->i_sb_list != &sb->s_inodes) { spin_lock(&next_i->i_lock); if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && atomic_read(&next_i->i_count)) { @@ -224,12 +204,12 @@ void fsnotify_unmount_inodes(struct list_head *list) } /* - * We can safely drop inode_sb_list_lock here because either + * We can safely drop s_inode_list_lock here because either * we actually hold references on both inode and next_i or * end of list. Also no new inodes will be added since the * umount has begun. */ - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); if (need_iput_tmp) iput(need_iput_tmp); @@ -241,7 +221,7 @@ void fsnotify_unmount_inodes(struct list_head *list) iput(inode); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 450648697433..5b1e2a497e51 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -26,7 +26,7 @@ #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/idr.h> -#include <linux/init.h> /* module_init */ +#include <linux/init.h> /* fs_initcall */ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ @@ -812,4 +812,4 @@ static int __init inotify_user_setup(void) return 0; } -module_init(inotify_user_setup); +fs_initcall(inotify_user_setup); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 92e48c70f0f0..fc0df4442f7b 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head) } /* - * Any time a mark is getting freed we end up here. - * The caller had better be holding a reference to this mark so we don't actually - * do the final put under the mark->lock + * Remove mark from inode / vfsmount list, group list, drop inode reference + * if we got one. + * + * Must be called with group->mark_mutex held. */ -void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group) +void fsnotify_detach_mark(struct fsnotify_mark *mark) { struct inode *inode = NULL; + struct fsnotify_group *group = mark->group; BUG_ON(!mutex_is_locked(&group->mark_mutex)); spin_lock(&mark->lock); /* something else already called this function on this mark */ - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } - mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { inode = mark->inode; @@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, fsnotify_destroy_vfsmount_mark(mark); else BUG(); + /* + * Note that we didn't update flags telling whether inode cares about + * what's happening with children. We update these flags from + * __fsnotify_parent() lazily when next event happens on one of our + * children. + */ list_del_init(&mark->g_list); @@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) iput(inode); - /* release lock temporarily */ - mutex_unlock(&group->mark_mutex); + + atomic_dec(&group->num_marks); +} + +/* + * Free fsnotify mark. The freeing is actually happening from a kthread which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. + */ +void fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group = mark->group; + + spin_lock(&mark->lock); + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + spin_unlock(&mark->lock); + return; + } + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + spin_unlock(&mark->lock); spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); - /* - * We don't necessarily have a ref on mark from caller so the above destroy - * may have actually freed it, unless this group provides a 'freeing_mark' - * function which must be holding a reference. - */ /* * Some groups like to know that marks are being freed. This is a @@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); - - /* - * __fsnotify_update_child_dentry_flags(inode); - * - * I really want to call that, but we can't, we have no idea if the inode - * still exists the second we drop the mark->lock. - * - * The next time an event arrive to this inode from one of it's children - * __fsnotify_parent will see that the inode doesn't care about it's - * children and will update all of these flags then. So really this - * is just a lazy update (and could be a perf win...) - */ - - atomic_dec(&group->num_marks); - - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - fsnotify_destroy_mark_locked(mark, group); + fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); } -/* - * Destroy all marks in the given list. The marks must be already detached from - * the original inode / vfsmount. - */ -void fsnotify_destroy_marks(struct list_head *to_free) +void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) { - struct fsnotify_mark *mark, *lmark; - struct fsnotify_group *group; - - list_for_each_entry_safe(mark, lmark, to_free, free_list) { - spin_lock(&mark->lock); - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); + struct fsnotify_mark *mark; - fsnotify_destroy_mark(mark, group); + while (1) { + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop 'lock', + * mark can get removed from the obj_list and destroyed. But + * we are holding mark reference so mark cannot be freed and + * calling fsnotify_destroy_mark() more than once is fine. + */ + spin_lock(lock); + if (hlist_empty(head)) { + spin_unlock(lock); + break; + } + mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); + /* + * We don't update i_fsnotify_mask / mnt_fsnotify_mask here + * since inode / mount is going away anyway. So just remove + * mark from the list. + */ + hlist_del_init_rcu(&mark->obj_list); + fsnotify_get_mark(mark); + spin_unlock(lock); + fsnotify_destroy_mark(mark, mark->group); fsnotify_put_mark(mark); - fsnotify_put_group(group); } } @@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, * inode->i_lock */ spin_lock(&mark->lock); - mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; fsnotify_get_group(group); mark->group = group; @@ -412,16 +428,37 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags) { struct fsnotify_mark *lmark, *mark; + LIST_HEAD(to_free); + /* + * We have to be really careful here. Anytime we drop mark_mutex, e.g. + * fsnotify_clear_marks_by_inode() can come and free marks. Even in our + * to_free list so we have to use mark_mutex even when accessing that + * list. And freeing mark requires us to drop mark_mutex. So we can + * reliably free only the first mark in the list. That's why we first + * move marks to free to to_free list in one go and then free marks in + * to_free list one by one. + */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->flags & flags) { - fsnotify_get_mark(mark); - fsnotify_destroy_mark_locked(mark, group); - fsnotify_put_mark(mark); - } + if (mark->flags & flags) + list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); + + while (1) { + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + if (list_empty(&to_free)) { + mutex_unlock(&group->mark_mutex); + break; + } + mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); + fsnotify_get_mark(mark); + fsnotify_detach_mark(mark); + mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); + fsnotify_put_mark(mark); + } } /* diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 326b148e623c..a8fcab68faef 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -28,25 +28,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include "../mount.h" - -void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) -{ - struct fsnotify_mark *mark; - struct hlist_node *n; - struct mount *m = real_mount(mnt); - LIST_HEAD(free_list); - - spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { - list_add(&mark->free_list, &free_list); - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - } - spin_unlock(&mnt->mnt_root->d_lock); - - fsnotify_destroy_marks(&free_list); -} void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { diff --git a/fs/nsfs.c b/fs/nsfs.c index 99521e7c492b..e4905fbf3396 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -4,6 +4,7 @@ #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/ktime.h> +#include <linux/seq_file.h> static struct vfsmount *nsfs_mnt; @@ -136,9 +137,18 @@ out_invalid: return ERR_PTR(-EINVAL); } +static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + + return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); +} + static const struct super_operations nsfs_ops = { .statfs = simple_statfs, .evict_inode = nsfs_evict, + .show_path = nsfs_show_path, }; static struct dentry *nsfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 2cd653670764..262561fea923 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -382,7 +382,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, base_ni = ni; if (NInoAttr(ni)) base_ni = ni->ext.base_ntfs_ino; - err = file_remove_suid(file); + err = file_remove_privs(file); if (unlikely(err)) goto out; /* diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 76b6cfb579d7..b3c3469de6cb 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -239,7 +239,7 @@ typedef struct { */ static inline ntfs_inode *NTFS_I(struct inode *inode) { - return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode); + return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); } static inline struct inode *VFS_I(ntfs_inode *ni) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 9e1e112074fb..d1a853585b53 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -543,7 +543,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) return -EROFS; } if (!ntfs_stamp_usnjrnl(vol)) { - ntfs_error(sb, "Failed to stamp transation log " + ntfs_error(sb, "Failed to stamp transaction log " "($UsnJrnl)%s", es); NVolSetErrors(vol); return -EROFS; @@ -2204,17 +2204,12 @@ get_ctx_vol_failed: return true; #ifdef NTFS_RW iput_usnjrnl_err_out: - if (vol->usnjrnl_j_ino) - iput(vol->usnjrnl_j_ino); - if (vol->usnjrnl_max_ino) - iput(vol->usnjrnl_max_ino); - if (vol->usnjrnl_ino) - iput(vol->usnjrnl_ino); + iput(vol->usnjrnl_j_ino); + iput(vol->usnjrnl_max_ino); + iput(vol->usnjrnl_ino); iput_quota_err_out: - if (vol->quota_q_ino) - iput(vol->quota_q_ino); - if (vol->quota_ino) - iput(vol->quota_ino); + iput(vol->quota_q_ino); + iput(vol->quota_ino); iput(vol->extend_ino); #endif /* NTFS_RW */ iput_sec_err_out: @@ -2223,8 +2218,7 @@ iput_root_err_out: iput(vol->root_ino); iput_logfile_err_out: #ifdef NTFS_RW - if (vol->logfile_ino) - iput(vol->logfile_ino); + iput(vol->logfile_ino); iput_vol_err_out: #endif /* NTFS_RW */ iput(vol->vol_ino); @@ -2254,8 +2248,7 @@ iput_mftbmp_err_out: iput(vol->mftbmp_ino); iput_mirr_err_out: #ifdef NTFS_RW - if (vol->mftmirr_ino) - iput(vol->mftmirr_ino); + iput(vol->mftmirr_ino); #endif /* NTFS_RW */ return false; } diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index c58a1bcfda0f..0cdf497c91ef 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle, int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); + struct buffer_head *bh = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); + ocfs2_inode_unlock(inode, 1); + brelse(bh); + return status; } struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) @@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; - int ret = -EAGAIN; + int ret; osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; - - ret = ocfs2_read_inode_block(inode, &di_bh); - if (ret < 0) + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + if (ret != -ENOENT) + mlog_errno(ret); return ERR_PTR(ret); + } acl = ocfs2_get_acl_nolock(inode, type, di_bh); + ocfs2_inode_unlock(inode, 0); brelse(di_bh); - return acl; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5997c00a1515..86181d6526dc 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ocfs2_error(sb, - "Extent block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - eb->h_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + goto bail; } if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extent block #%llu has an invalid h_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(eb->h_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + goto bail; } if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extent block #%llu has an invalid " - "h_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(eb->h_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Extent block #%llu has an invalid h_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + goto bail; } - - return 0; +bail: + return rc; } int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, @@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty " - "extent list (next_free_rec == 0)", + "Owner %llu has empty extent list (next_free_rec == 0)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; @@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent " - "list where extent # %d has no physical " - "block start", + "Owner %llu has extent list where extent # %d has no physical block start\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; @@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has empty extent list at " - "depth %u\n", + "Owner %llu has empty extent list at depth %u\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; @@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad blkno in extent list " - "at depth %u (index %d)\n", + "Owner %llu has bad blkno in extent list at depth %u (index %d)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; @@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad count in extent list " - "at block %llu (next free=%u, count=%u)\n", + "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), @@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, if (left_el->l_next_free_rec != left_el->l_count) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Inode %llu has non-full interior leaf node %llu" - "(next free = %u)", + "Inode %llu has non-full interior leaf node %llu (next free = %u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); @@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -3131,6 +3120,30 @@ out: return ret; } +static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + handle_t *handle; + int ret; + int credits = path->p_tree_depth * 2 + 1; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + return ret; +} + /* * Left rotation of btree records. * @@ -3200,7 +3213,7 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at %llu", + "Owner %llu has empty extent block at %llu\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; @@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has a bad extent list", + "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; @@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of " - "%d. It should have " - "matched the l_count of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); @@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; @@ -4970,10 +4979,9 @@ leftright: split_index = ocfs2_search_extent_list(el, cpos); if (split_index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u " - "which can no longer be found.\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(sb, - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long) - ocfs2_metadata_cache_owner(et->et_ci), cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode, cpos, len, phys); if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); ret = -EROFS; goto out; @@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", + "Owner %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: split at cpos %u lost record.", + "Owner %llu: split at cpos %u lost record\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle, ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: error after split at cpos %u" - "trunc len %u, existing record is (%u,%u)", + "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); @@ -6175,7 +6178,7 @@ bail: iput(tl_inode); brelse(tl_bh); - if (status < 0 && (*tl_copy)) { + if (status < 0) { kfree(*tl_copy); *tl_copy = NULL; mlog_errno(status); @@ -7108,15 +7111,23 @@ start: * to check it up here before changing the tree. */ if (root_el->l_tree_depth && rec->e_int_clusters == 0) { - ocfs2_error(inode->i_sb, "Inode %lu has an empty " + mlog(ML_ERROR, "Inode %lu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); - status = -EROFS; - goto bail; + status = ocfs2_remove_rightmost_empty_extent(osb, + &et, path, &dealloc); + if (status) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + goto start; + } else { + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; } - trunc_cpos = le32_to_cpu(rec->e_cpos); - trunc_len = 0; - blkno = 0; } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { /* * Truncate entire record. @@ -7204,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || !ocfs2_supports_inline_data(osb)) { ocfs2_error(inode->i_sb, - "Inline data flags for inode %llu don't agree! " - "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, le16_to_cpu(di->i_dyn_features), OCFS2_I(inode)->ip_dyn_features, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1a35c6139656..64b11d90eca6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { - ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); return -EROFS; } @@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size > PAGE_CACHE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, - "Inode %llu has with inline data has bad size: %Lu", + "Inode %llu has with inline data has bad size: %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)size); return -EROFS; @@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + down_read(&OCFS2_I(inode)->ip_alloc_sem); + /* This figures out the size of the next contiguous block, and * our logical offset */ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); @@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, alloc_locked = 1; + down_write(&OCFS2_I(inode)->ip_alloc_sem); + /* fill hole, allocate blocks can't be larger than the size * of the hole */ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); @@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extend_allocation(inode, cpos, clusters_to_alloc, 0); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog_errno(ret); goto bail; } @@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); ret = -EIO; goto bail; } + up_write(&OCFS2_I(inode)->ip_alloc_sem); } /* @@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } - ocfs2_iocb_clear_rw_locked(iocb); + /* Let rw unlock to be done later to protect append direct io write */ + if (offset + bytes <= i_size_read(inode)) { + ocfs2_iocb_clear_rw_locked(iocb); - level = ocfs2_iocb_rw_locked_level(iocb); - ocfs2_rw_unlock(inode, level); + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + } } static int ocfs2_releasepage(struct page *page, gfp_t wait) @@ -685,7 +697,7 @@ static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { u64 s = i_size_read(inode); - sector_t sector = (p_cpos << (osb->s_clustersize_bits - 9)) + + sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) + (do_div(s, osb->s_clustersize) >> 9); ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, @@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, /* zeroing out the previously allocated cluster tail * that but not zeroed */ - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, zero_len_tail, cluster_align_tail); - else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_extend_no_holes(osb, inode, offset); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + } if (ret < 0) { mlog_errno(ret); ocfs2_inode_unlock(inode, 1); @@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); - if (unlikely(written < 0)) { + /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ + if ((written < 0) && (written != -EIOCBQUEUED)) { loff_t i_size = i_size_read(inode); if (offset + count > i_size) { @@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, ocfs2_inode_unlock(inode, 1); brelse(di_bh); + di_bh = NULL; goto clean_orphan; } } ocfs2_inode_unlock(inode, 1); brelse(di_bh); + di_bh = NULL; ret = jbd2_journal_force_commit(journal); if (ret < 0) @@ -910,7 +930,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); ret = blkdev_issue_zeroout(osb->sb->s_bdev, - p_cpos << (osb->s_clustersize_bits - 9), + (u64)p_cpos << (osb->s_clustersize_bits - 9), zero_len_head >> 9, GFP_NOFS, false); if (ret < 0) mlog_errno(ret); @@ -936,10 +956,12 @@ clean_orphan: if (tmp_ret < 0) { ret = tmp_ret; mlog_errno(ret); + brelse(di_bh); goto out; } ocfs2_inode_unlock(inode, 1); + brelse(di_bh); tmp_ret = jbd2_journal_force_commit(journal); if (tmp_ret < 0) { @@ -2185,10 +2207,7 @@ try_again: if (ret) goto out_commit; } - /* - * We don't want this to fail in ocfs2_write_end(), so do it - * here. - */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int i; + int i, ret; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, handle_t *handle = wc->w_handle; struct page *tmppage; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); goto out_write_size; @@ -2409,6 +2436,7 @@ out_write_size: ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); +out: /* unlock pages before dealloc since it needs acquiring j_trans_barrier * lock, or it will cause a deadlock since journal commit threads holds * this lock and will ask for the page lock when flushing the data. diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1edcb141f639..fe50ded1b4ce 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { + if (status) { + /* Clear the rest of the buffers on error */ + put_bh(bh); + bhs[i] = NULL; + continue; + } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 16eff45727ee..fa15debcc02b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -36,7 +36,7 @@ #include <linux/debugfs.h> #include <linux/slab.h> #include <linux/bitmap.h> - +#include <linux/ktime.h> #include "heartbeat.h" #include "tcp.h" #include "nodemanager.h" @@ -372,14 +372,13 @@ static void o2hb_wait_on_io(struct o2hb_region *reg, wait_for_completion(&wc->wc_io_complete); } -static void o2hb_bio_end_io(struct bio *bio, - int error) +static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) { - mlog(ML_ERROR, "IO Error %d\n", error); - wc->wc_error = error; + if (bio->bi_error) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_error); + wc->wc_error = bio->bi_error; } o2hb_bio_wait_dec(wc, 1); @@ -1061,37 +1060,6 @@ bail: return ret; } -/* Subtract b from a, storing the result in a. a *must* have a larger - * value than b. */ -static void o2hb_tv_subtract(struct timeval *a, - struct timeval *b) -{ - /* just return 0 when a is after b */ - if (a->tv_sec < b->tv_sec || - (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { - a->tv_sec = 0; - a->tv_usec = 0; - return; - } - - a->tv_sec -= b->tv_sec; - a->tv_usec -= b->tv_usec; - while ( a->tv_usec < 0 ) { - a->tv_sec--; - a->tv_usec += 1000000; - } -} - -static unsigned int o2hb_elapsed_msecs(struct timeval *start, - struct timeval *end) -{ - struct timeval res = *end; - - o2hb_tv_subtract(&res, start); - - return res.tv_sec * 1000 + res.tv_usec / 1000; -} - /* * we ride the region ref that the region dir holds. before the region * dir is removed and drops it ref it will wait to tear down this @@ -1102,7 +1070,7 @@ static int o2hb_thread(void *data) int i, ret; struct o2hb_region *reg = data; struct o2hb_bio_wait_ctxt write_wc; - struct timeval before_hb, after_hb; + ktime_t before_hb, after_hb; unsigned int elapsed_msec; mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); @@ -1119,18 +1087,18 @@ static int o2hb_thread(void *data) * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ - do_gettimeofday(&before_hb); + before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); - do_gettimeofday(&after_hb); - elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + after_hb = ktime_get_real(); + + elapsed_msec = (unsigned int) + ktime_ms_delta(after_hb, before_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", - before_hb.tv_sec, (unsigned long) before_hb.tv_usec, - after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec, ret); + "start = %lld, end = %lld, msec = %u, ret = %d\n", + before_hb.tv64, after_hb.tv64, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { @@ -1620,17 +1588,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); - if (reg->hr_tmp_block == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_tmp_block == NULL) return -ENOMEM; - } reg->hr_slots = kcalloc(reg->hr_blocks, sizeof(struct o2hb_disk_slot), GFP_KERNEL); - if (reg->hr_slots == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_slots == NULL) return -ENOMEM; - } for(i = 0; i < reg->hr_blocks; i++) { slot = ®->hr_slots[i]; @@ -1646,17 +1610,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), GFP_KERNEL); - if (!reg->hr_slot_data) { - mlog_errno(-ENOMEM); + if (!reg->hr_slot_data) return -ENOMEM; - } for(i = 0; i < reg->hr_num_pages; i++) { page = alloc_page(GFP_KERNEL); - if (!page) { - mlog_errno(-ENOMEM); + if (!page) return -ENOMEM; - } reg->hr_slot_data[i] = page; @@ -1688,10 +1648,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_heartbeat_block *hb_block; ret = o2hb_read_slots(reg, reg->hr_blocks); - if (ret) { - mlog_errno(ret); + if (ret) goto out; - } /* We only want to get an idea of the values initially in each * slot, so we do no verification - o2hb_check_slot will diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 02878a83f0b4..ffecf89c8c1c 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - trailer->db_signature); + rc = ocfs2_error(dir->i_sb, + "Invalid dirblock #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu has an invalid db_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + rc = ocfs2_error(dir->i_sb, + "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } out: @@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { - ocfs2_error(sb, - "Dir Index Root # %llu has bad signature %.*s", - (unsigned long long)le64_to_cpu(dx_root->dr_blkno), - 7, dx_root->dr_signature); - return -EINVAL; + ret = ocfs2_error(sb, + "Dir Index Root # %llu has bad signature %.*s\n", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); } - return 0; + return ret; } static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, @@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { - ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", - 7, dx_leaf->dl_signature); - return -EROFS; + ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", + 7, dx_leaf->dl_signature); } - return 0; + return ret; } static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, @@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "btree tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in btree tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } if (!found) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in btree", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7df88a6dd626..6918f30d02cd 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm, if (status == -ENOPROTOOPT) { status = 0; *response = JOIN_OK_NO_MAP; - } else if (packet.code == JOIN_DISALLOW || - packet.code == JOIN_OK_NO_MAP) { - *response = packet.code; - } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { - mlog(ML_NOTICE, - "This node requested DLM locking protocol %u.%u and " - "filesystem locking protocol %u.%u. At least one of " - "the protocol versions on node %d is not compatible, " - "disconnecting\n", - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor, - node); - status = -EPROTO; - *response = packet.code; - } else if (packet.code == JOIN_OK) { - *response = packet.code; - /* Use the same locking protocol as the remote node */ - dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; - dlm->fs_locking_proto.pv_minor = packet.fs_minor; - mlog(0, - "Node %d responds JOIN_OK with DLM locking protocol " - "%u.%u and fs locking protocol %u.%u\n", - node, - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor); } else { - status = -EINVAL; - mlog(ML_ERROR, "invalid response %d from node %u\n", - packet.code, node); + *response = packet.code; + switch (packet.code) { + case JOIN_DISALLOW: + case JOIN_OK_NO_MAP: + break; + case JOIN_PROTOCOL_MISMATCH: + mlog(ML_NOTICE, + "This node requested DLM locking protocol %u.%u and " + "filesystem locking protocol %u.%u. At least one of " + "the protocol versions on node %d is not compatible, " + "disconnecting\n", + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor, + node); + status = -EPROTO; + break; + case JOIN_OK: + /* Use the same locking protocol as the remote node */ + dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; + dlm->fs_locking_proto.pv_minor = packet.fs_minor; + mlog(0, + "Node %d responds JOIN_OK with DLM locking protocol " + "%u.%u and fs locking protocol %u.%u\n", + node, + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor); + break; + default: + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", + packet.code, node); + /* Reset response to JOIN_DISALLOW */ + *response = JOIN_DISALLOW; + break; + } } mlog(0, "status %d, node %d response is %d\n", status, node, @@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); if (status) goto bail; - o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, - dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); if (status) goto bail; @@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) sizeof(struct dlm_exit_domain), dlm_begin_exit_domain_handler, dlm, NULL, &dlm->dlm_domain_handlers); - if (status) - goto bail; bail: if (status) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index fdf4b41d0609..46b8b2bbc95a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); - spin_lock(&dlm->track_lock); - if (!list_empty(&res->tracking)) - list_del_init(&res->tracking); - else { - mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", - res->lockname.len, res->lockname.name); - dlm_print_one_lock_resource(res); - } - spin_unlock(&dlm->track_lock); - atomic_dec(&dlm->res_cur_count); if (!hlist_unhashed(&res->hash_node) || @@ -795,8 +785,18 @@ lookup: dlm_lockres_grab_inflight_ref(dlm, tmpres); spin_unlock(&tmpres->spinlock); - if (res) + if (res) { + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else + mlog(ML_ERROR, "Resource %.*s not " + "on the Tracking list\n", + res->lockname.len, + res->lockname.name); + spin_unlock(&dlm->track_lock); dlm_lockres_put(res); + } res = tmpres; goto leave; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ce12e0b1a31f..d0e436dc6437 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue, *iter; + struct list_head *queue; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; @@ -1821,9 +1821,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each(iter, tmpq) { - lock = list_entry(iter, - struct dlm_lock, list); + list_for_each_entry(lock, tmpq, list) { if (lock->ml.cookie == ml->cookie) break; lock = NULL; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 69aac6f088ad..2e5e6d5fffe8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, __dlm_unhash_lockres(dlm, res); + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8b23aa2f52dd..1c91103c1333 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3035,8 +3035,6 @@ local: ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; - - status = 0; bail: if (status < 0) { ocfs2_dlm_shutdown_debug(osb); @@ -4025,9 +4023,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) osb->dc_work_sequence = osb->dc_wake_sequence; processed = osb->blocked_lock_count; - while (processed) { - BUG_ON(list_empty(&osb->blocked_lock_list)); - + /* + * blocked lock processing in this loop might call iput which can + * remove items off osb->blocked_lock_list. Downconvert up to + * 'processed' number of locks, but stop short if we had some + * removed in ocfs2_mark_lockres_freeing when downconverting. + */ + while (processed && !list_empty(&osb->blocked_lock_list)) { lockres = list_entry(osb->blocked_lock_list.next, struct ocfs2_lock_res, l_blocked_list); list_del_init(&lockres->l_blocked_list); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 767370b656ca..e4719e0a3f99 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0)\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; @@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 719f7f4c7a37..0e5b4515f92e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -105,8 +105,11 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, mode); - if (file->f_mode & FMODE_WRITE) - dquot_initialize(inode); + if (file->f_mode & FMODE_WRITE) { + status = dquot_initialize(inode); + if (status) + goto leave; + } spin_lock(&oi->ip_lock); @@ -1127,6 +1130,7 @@ out: int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; + int inode_locked = 0; struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); @@ -1155,8 +1159,11 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) return status; - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); + if (is_quota_modification(inode, attr)) { + status = dquot_initialize(inode); + if (status) + return status; + } size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { status = ocfs2_rw_lock(inode, 1); @@ -1172,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock_rw; } + inode_locked = 1; if (size_change) { status = inode_newsize_ok(inode, attr->ia_size); @@ -1209,8 +1217,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); - if (!transfer_to[USRQUOTA]) { - status = -ESRCH; + if (IS_ERR(transfer_to[USRQUOTA])) { + status = PTR_ERR(transfer_to[USRQUOTA]); goto bail_unlock; } } @@ -1218,8 +1226,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); - if (!transfer_to[GRPQUOTA]) { - status = -ESRCH; + if (IS_ERR(transfer_to[GRPQUOTA])) { + status = PTR_ERR(transfer_to[GRPQUOTA]); goto bail_unlock; } } @@ -1252,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_inode_unlock(inode, 1); + if (status) { + ocfs2_inode_unlock(inode, 1); + inode_locked = 0; + } bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); @@ -1268,6 +1279,8 @@ bail: if (status < 0) mlog_errno(status); } + if (inode_locked) + ocfs2_inode_unlock(inode, 1); return status; } @@ -2256,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ssize_t written = 0; ssize_t ret; size_t count = iov_iter_count(from), orig_count; - loff_t old_size; - u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2265,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, OCFS2_MOUNT_COHERENCY_BUFFERED); int unaligned_dio = 0; int dropped_dio = 0; + int append_write = ((iocb->ki_pos + count) >= + i_size_read(inode) ? 1 : 0); trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2284,8 +2297,9 @@ relock: /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". + * For append write, we must take rw EX. */ - rw_level = (!direct_io || full_coherency); + rw_level = (!direct_io || full_coherency || append_write); ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { @@ -2358,13 +2372,6 @@ relock: ocfs2_iocb_set_unaligned_aio(iocb); } - /* - * To later detect whether a journal commit for sync writes is - * necessary, we sample i_size, and cluster count here. - */ - old_size = i_size_read(inode); - old_clusters = OCFS2_I(inode)->ip_clusters; - /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); @@ -2372,6 +2379,20 @@ relock: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. + */ + if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { + rw_level = -1; + unaligned_dio = 0; + } + if (unlikely(written <= 0)) goto no_sync; @@ -2396,21 +2417,7 @@ relock: } no_sync: - /* - * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io - * function pointer which is called when o_direct io completes so that - * it can unlock our rw lock. - * Unfortunately there are error cases which call end_io and others - * that don't. so we don't have to unlock the rw_lock if either an - * async dio is going to do it in the future or an end_io after an - * error has already done it. - */ - if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { - rw_level = -1; - unaligned_dio = 0; - } - - if (unaligned_dio) { + if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b254416dc8d9..8f87e05ee25d 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode) int wipe, status; sigset_t oldset; struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_delete_inode(inode->i_ino, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail_unlock_nfs_sync; } + di = (struct ocfs2_dinode *)di_bh->b_data; + /* Skip inode deletion and wait for dio orphan entry recovered + * first */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unlock_inode; + } + /* Query the cluster. This will be the final decision made * before we go ahead and wipe the inode. */ status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); @@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode) int ocfs2_drop_inode(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - int res; trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) - res = 1; - else - res = generic_drop_inode(inode); + assert_spin_locked(&inode->i_lock); + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; - return res; + return 1; } /* @@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb, rc = -EINVAL; if (!OCFS2_IS_VALID_DINODE(di)) { - ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - di->i_signature); + rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { - ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - ocfs2_error(sb, - "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", - (unsigned long long)bh->b_blocknr); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Invalid dinode #%llu: fs_generation is %u\n", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(di->i_fs_generation)); + rc = ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); goto bail; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5e86b247c821..ca3431ee7f24 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -81,8 +81,6 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; - wait_queue_head_t append_dio_wq; - struct dquot *i_dquot[MAXQUOTAS]; }; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 53e6c40ed4c6..3cb097ccce60 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -980,7 +980,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: - case FITRIM: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7c099f7032fd..ff82b28462a6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) mlog_errno(PTR_ERR(handle)); if (is_journal_aborted(journal)) { - ocfs2_abort(osb->sb, "Detected aborted journal"); + ocfs2_abort(osb->sb, "Detected aborted journal\n"); handle = ERR_PTR(-EROFS); } } else { @@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle, mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); mlog(ML_ERROR, "b_blocknr=%llu\n", (unsigned long long)bh->b_blocknr); - BUG(); + + lock_buffer(bh); + /* + * A previous attempt to write this buffer head failed. + * Nothing we can do but to retry the write and hope for + * the best. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + if (!buffer_uptodate(bh)) { + unlock_buffer(bh); + return -EIO; + } + unlock_buffer(bh); } /* Set the current transaction information on the ci so @@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; + mutex_lock(&inode->i_mutex); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; spin_unlock(&oi->ip_lock); - } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && + } + + if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ret = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); @@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); if (ret) mlog_errno(ret); - - wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ unlock_inode: ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; unlock_rw: ocfs2_rw_unlock(inode, 1); next: + mutex_unlock(&inode->i_mutex); iput(inode); - brelse(di_bh); - di_bh = NULL; inode = iter; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 857bbbcd39f3..0a4457fb0711 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u used bits, but a count shows %u", + ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 56a768d06aa6..124471d26a73 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6e6abb93fda5..b7dfac226b1e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -200,11 +200,12 @@ bail: static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) { struct inode *inode; + int status; inode = new_inode(dir->i_sb); if (!inode) { mlog(ML_ERROR, "new_inode failed!\n"); - return NULL; + return ERR_PTR(-ENOMEM); } /* populate as many fields early on as possible - many of @@ -213,7 +214,10 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) if (S_ISDIR(mode)) set_nlink(inode, 2); inode_init_owner(inode, dir, mode); - dquot_initialize(inode); + status = dquot_initialize(inode); + if (status) + return ERR_PTR(status); + return inode; } @@ -264,7 +268,11 @@ static int ocfs2_mknod(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long)dev, mode); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + mlog_errno(status); + return status; + } /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -311,8 +319,9 @@ static int ocfs2_mknod(struct inode *dir, } inode = ocfs2_get_init_inode(dir, mode); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto leave; } @@ -708,7 +717,11 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; - dquot_initialize(dir); + err = dquot_initialize(dir); + if (err) { + mlog_errno(err); + return err; + } err = ocfs2_double_lock(osb, &old_dir_bh, old_dir, &parent_fe_bh, dir, 0); @@ -896,7 +909,11 @@ static int ocfs2_unlink(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + mlog_errno(status); + return status; + } BUG_ON(d_inode(dentry->d_parent) != dir); @@ -1018,11 +1035,6 @@ leave: if (handle) ocfs2_commit_trans(osb, handle); - if (child_locked) - ocfs2_inode_unlock(inode, 1); - - ocfs2_inode_unlock(dir, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); @@ -1030,6 +1042,11 @@ leave: iput(orphan_dir); } + if (child_locked) + ocfs2_inode_unlock(inode, 1); + + ocfs2_inode_unlock(dir, 1); + brelse(fe_bh); brelse(parent_node_bh); @@ -1230,8 +1247,16 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); - dquot_initialize(old_dir); - dquot_initialize(new_dir); + status = dquot_initialize(old_dir); + if (status) { + mlog_errno(status); + goto bail; + } + status = dquot_initialize(new_dir); + if (status) { + mlog_errno(status); + goto bail; + } osb = OCFS2_SB(old_dir->i_sb); @@ -1284,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir, } parents_locked = 1; + if (!new_dir->i_nlink) { + status = -EACCES; + goto bail; + } + /* make sure both dirs have bhs * get an extra ref on old_dir_bh if old==new */ if (!new_dir_bh) { @@ -1544,12 +1574,25 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_find_entry(old_dentry->d_name.name, old_dentry->d_name.len, old_dir, &old_entry_lookup); - if (status) + if (status) { + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; + } status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " + "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; } @@ -1608,21 +1651,9 @@ static int ocfs2_rename(struct inode *old_dir, ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); status = 0; bail: - if (rename_lock) - ocfs2_rename_unlock(osb); - if (handle) ocfs2_commit_trans(osb, handle); - if (parents_locked) - ocfs2_double_unlock(old_dir, new_dir); - - if (old_child_locked) - ocfs2_inode_unlock(old_inode, 1); - - if (new_child_locked) - ocfs2_inode_unlock(new_inode, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); @@ -1630,6 +1661,18 @@ bail: iput(orphan_dir); } + if (new_child_locked) + ocfs2_inode_unlock(new_inode, 1); + + if (old_child_locked) + ocfs2_inode_unlock(old_inode, 1); + + if (parents_locked) + ocfs2_double_unlock(old_dir, new_dir); + + if (rename_lock) + ocfs2_rename_unlock(osb); + if (new_inode) sync_mapping_buffers(old_inode->i_mapping); @@ -1786,7 +1829,11 @@ static int ocfs2_symlink(struct inode *dir, trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); - dquot_initialize(dir); + status = dquot_initialize(dir); + if (status) { + mlog_errno(status); + goto bail; + } sb = dir->i_sb; osb = OCFS2_SB(sb); @@ -1831,8 +1878,9 @@ static int ocfs2_symlink(struct inode *dir, } inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto bail; } @@ -2485,8 +2533,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, } inode = ocfs2_get_init_inode(dir, mode); - if (!inode) { - status = -ENOMEM; + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + inode = NULL; mlog_errno(status); goto leave; } @@ -2570,27 +2619,6 @@ leave: return status; } -static int ocfs2_dio_orphan_recovered(struct inode *inode) -{ - int ret; - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di = NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - return 0; - } - - di = (struct ocfs2_dinode *) di_bh->b_data; - ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - - return ret; -} - -#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode) { @@ -2602,7 +2630,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, handle_t *handle = NULL; struct ocfs2_dinode *di = NULL; -restart: status = ocfs2_inode_lock(inode, &di_bh, 1); if (status < 0) { mlog_errno(status); @@ -2612,15 +2639,21 @@ restart: di = (struct ocfs2_dinode *) di_bh->b_data; /* * Another append dio crashed? - * If so, wait for recovery first. + * If so, manually recover it first. */ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, - ocfs2_dio_orphan_recovered(inode), - msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); - goto restart; + status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail_unlock_inode; + } + + status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } } status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 690ddc60189b..7a0126267847 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -286,6 +286,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ + OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ + OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index db64ce2d4667..540ab5b75dbb 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -168,7 +168,7 @@ /* Refcount tree support */ #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 -/* Discontigous block groups */ +/* Discontiguous block groups */ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* @@ -939,7 +939,7 @@ struct ocfs2_group_desc /* * Block groups may be discontiguous when * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. - * The extents of a discontigous block group are + * The extents of a discontiguous block group are * stored in bg_list. It is a flat list. * l_tree_depth must always be zero. A * discontiguous group is signified by a non-zero diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 3d0b63d34225..8a54fd8a4fa5 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested " - "to read block %Lu but file has size only %Lu\n", + "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_block, (unsigned long long)i_size_read(inode)); @@ -499,8 +498,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, dquot = dqget(sb, make_kqid(&init_user_ns, type, le64_to_cpu(dqblk->dqb_id))); - if (!dquot) { - status = -EIO; + if (IS_ERR(dquot)) { + status = PTR_ERR(dquot); mlog(ML_ERROR, "Failed to get quota structure " "for id %u, type %d. Cannot finish quota " "file recovery.\n", diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b69dd14c0b9b..e5d57cd32505 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { - ocfs2_error(sb, - "Refcount block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - rb->rf_signature); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid rf_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(rb->rf_blkno)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Refcount block #%llu has an invalid " - "rf_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(rb->rf_fs_generation)); - return -EINVAL; + rc = ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + goto out; } - - return 0; +out: + return rc; } static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, @@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(sb, - "refcount tree %llu has non zero tree " - "depth in leaf btree tree block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(sb, + "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); - ret = -EROFS; + ret = ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } @@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); - return -EROFS; + return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); @@ -4419,8 +4406,9 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, } mutex_lock(&inode->i_mutex); - dquot_initialize(dir); - error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + error = dquot_initialize(dir); + if (!error) + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); mutex_unlock(&inode->i_mutex); if (!error) fsnotify_create(dir, new_dentry); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 2768eb1da2b8..ced70c8139f7 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -655,14 +655,7 @@ static int ocfs2_control_init(void) static void ocfs2_control_exit(void) { - int rc; - - rc = misc_deregister(&ocfs2_control_device); - if (rc) - printk(KERN_ERR - "ocfs2: Unable to deregister ocfs2_control device " - "(errno %d)\n", - -rc); + misc_deregister(&ocfs2_control_device); } static void fsdlm_lock_ast_wrapper(void *astarg) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4479029630bb..d83d2602cf2b 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; - if (ac->ac_find_loc_priv) { - kfree(ac->ac_find_loc_priv); - ac->ac_find_loc_priv = NULL; - } + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) } #define do_error(fmt, ...) \ - do{ \ - if (resize) \ - mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ - else \ - ocfs2_error(sb, fmt, ##__VA_ARGS__); \ - } while (0) +do { \ + if (resize) \ + mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ + else \ + return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ +} while (0) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, @@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - do_error("Group descriptor #%llu has bad signature %.*s", + do_error("Group descriptor #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); - return -EINVAL; } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { - do_error("Group descriptor #%llu has an invalid bg_blkno " - "of %llu", + do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); - return -EINVAL; } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { - do_error("Group descriptor #%llu has an invalid " - "fs_generation of #%u", + do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(gd->bg_generation)); - return -EINVAL; } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - do_error("Group descriptor #%llu has bit count %u but " - "claims that %u are free", + do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); - return -EINVAL; } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - do_error("Group descriptor #%llu has bit count %u but " - "max bitmap bits of %u", + do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 * le16_to_cpu(gd->bg_size)); - return -EINVAL; } return 0; @@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (di->i_blkno != gd->bg_parent_dinode) { - do_error("Group descriptor #%llu has bad parent " - "pointer (%llu, expected %llu)", + do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - do_error("Group descriptor #%llu has bit count of %u", + do_error("Group descriptor #%llu has bit count of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); - return -EINVAL; } /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ @@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || ((le16_to_cpu(gd->bg_chain) == le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { - do_error("Group descriptor #%llu has bad chain %u", + do_error("Group descriptor #%llu has bad chain %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); - return -EINVAL; } return 0; @@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle, struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { - ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " - "b_blocknr (%llu)", - (unsigned long long)group_blkno, - (unsigned long long) bg_bh->b_blocknr); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "group block (%llu) != b_blocknr (%llu)\n", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); goto bail; } @@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { - ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", - (unsigned long long)le64_to_cpu(fe->i_blkno)); - status = -EIO; + status = ocfs2_error(alloc_inode->i_sb, + "Invalid chain allocator %llu\n", + (unsigned long long)le64_to_cpu(fe->i_blkno)); goto bail; } @@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(ac->ac_inode->i_sb, - "Chain allocator dinode %llu has %u used " - "bits but only %u total.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->id1.bitmap1.i_used), - le32_to_cpu(fe->id1.bitmap1.i_total)); - status = -EIO; + status = ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used bits but only %u total\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); goto bail; } @@ -2429,12 +2410,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } if (undo_fn) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 403c5660b306..2de4c8a9340c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -192,6 +192,7 @@ enum { Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, + Opt_err_cont, Opt_err, }; @@ -224,6 +225,7 @@ static const match_table_t tokens = { {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_err_cont, "errors=continue"}, {Opt_err, NULL} }; @@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NOINTR; break; case Opt_err_panic: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; break; case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; + break; + case Opt_err_cont: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; break; case Opt_data_ordered: mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; @@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_ERRORS_PANIC) seq_printf(s, ",errors=panic"); + else if (opts & OCFS2_MOUNT_ERRORS_CONT) + seq_printf(s, ",errors=continue"); else seq_printf(s, ",errors=remount-ro"); @@ -1550,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, - osb->osb_cluster_stack); + seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, + OCFS2_STACK_LABEL_LEN); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) @@ -1746,8 +1759,6 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); - init_waitqueue_head(&oi->append_dio_wq); - ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); @@ -2541,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) memset(osb, 0, sizeof(struct ocfs2_super)); } -/* Put OCFS2 into a readonly state, or (if the user specifies it), - * panic(). We do not support continue-on-error operation. */ -static void ocfs2_handle_error(struct super_block *sb) +/* Depending on the mount option passed, perform one of the following: + * Put OCFS2 into a readonly state (default) + * Return EIO so that only the process errs + * Fix the error as if fsck.ocfs2 -y + * panic + */ +static int ocfs2_handle_error(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) - panic("OCFS2: (device %s): panic forced after error\n", - sb->s_id); + int rv = 0; ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + pr_crit("On-disk corruption discovered. " + "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); - if (sb->s_flags & MS_RDONLY && - (ocfs2_is_soft_readonly(osb) || - ocfs2_is_hard_readonly(osb))) - return; - - printk(KERN_CRIT "File system is now read-only due to the potential " - "of on-disk corruption. Please run fsck.ocfs2 once the file " - "system is unmounted.\n"); - sb->s_flags |= MS_RDONLY; - ocfs2_set_ro_flag(osb, 0); + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { + pr_crit("OCFS2: Returning error to the calling process.\n"); + rv = -EIO; + } else { /* default option */ + rv = -EROFS; + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return rv; + + pr_crit("OCFS2: File system is now read-only.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); + } + + return rv; } -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; @@ -2577,12 +2600,12 @@ void __ocfs2_error(struct super_block *sb, const char *function, /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); - ocfs2_handle_error(sb); + return ocfs2_handle_error(sb); } /* Handle critical errors. This is intentionally more drastic than @@ -2599,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 74ff74cf78fe..b477d0b1c7b6 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); __printf(3, 4) -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_error(sb, fmt, ...) \ + __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) __printf(3, 4) void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_abort(sb, fmt, ...) \ + __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) /* * Void signal blockers, because in-kernel sigprocmask() only fails diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 889f3796a0d7..ebfdea78659b 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ocfs2_error(sb, - "Extended attribute block #%llu has bad " - "signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - xb->xb_signature); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); } if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { - ocfs2_error(sb, - "Extended attribute block #%llu has an " - "invalid xb_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(xb->xb_blkno)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); } if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, - "Extended attribute block #%llu has an invalid " - "xb_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(xb->xb_fs_generation)); - return -EINVAL; + return ocfs2_error(sb, + "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); } return 0; @@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in xattr tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); - ret = -EROFS; + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } @@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; + if (!capable(CAP_SYS_ADMIN)) + return 0; + if (list && total_len <= list_size) { memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); memcpy(list + prefix_len, name, name_len); diff --git a/fs/open.c b/fs/open.c index e0250bdcc440..b6f1e96a7c0b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -51,8 +51,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ATTR_FILE; } - /* Remove suid/sgid on truncate too */ - ret = should_remove_suid(dentry); + /* Remove suid, sgid, and file capabilities on truncate too */ + ret = dentry_needs_remove_privs(dentry); + if (ret < 0) + return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; @@ -375,7 +377,7 @@ retry: * with the "noexec" flag. */ res = -EACCES; - if (path.mnt->mnt_flags & MNT_NOEXEC) + if (path_noexec(&path)) goto out_path_release; } @@ -678,18 +680,18 @@ int open_check_o_direct(struct file *f) } static int do_dentry_open(struct file *f, + struct inode *inode, int (*open)(struct inode *, struct file *), const struct cred *cred) { static const struct file_operations empty_fops = {}; - struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; path_get(&f->f_path); - inode = f->f_inode = f->f_path.dentry->d_inode; + f->f_inode = inode; f->f_mapping = inode->i_mapping; if (unlikely(f->f_flags & O_PATH)) { @@ -793,7 +795,8 @@ int finish_open(struct file *file, struct dentry *dentry, BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; - error = do_dentry_open(file, open, current_cred()); + error = do_dentry_open(file, d_backing_inode(dentry), open, + current_cred()); if (!error) *opened |= FILE_OPENED; @@ -822,6 +825,34 @@ int finish_no_open(struct file *file, struct dentry *dentry) } EXPORT_SYMBOL(finish_no_open); +char *file_path(struct file *filp, char *buf, int buflen) +{ + return d_path(&filp->f_path, buf, buflen); +} +EXPORT_SYMBOL(file_path); + +/** + * vfs_open - open the file at the given path + * @path: path to open + * @file: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *file, + const struct cred *cred) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = dentry->d_inode; + + file->f_path = *path; + if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { + inode = dentry->d_op->d_select_inode(dentry, file->f_flags); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } + + return do_dentry_open(file, inode, NULL, cred); +} + struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { @@ -853,26 +884,6 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); -/** - * vfs_open - open the file at the given path - * @path: path to open - * @filp: newly allocated file with f_flag initialized - * @cred: credentials to use - */ -int vfs_open(const struct path *path, struct file *filp, - const struct cred *cred) -{ - struct inode *inode = path->dentry->d_inode; - - if (inode->i_op->dentry_open) - return inode->i_op->dentry_open(path->dentry, filp, cred); - else { - filp->f_path = *path; - return do_dentry_open(filp, NULL, cred); - } -} -EXPORT_SYMBOL(vfs_open); - static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 308379b2d0b2..d9da5a4e9382 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -337,37 +337,33 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, return true; } -static int ovl_dentry_open(struct dentry *dentry, struct file *file, - const struct cred *cred) +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) { int err; struct path realpath; enum ovl_path_type type; - bool want_write = false; + + if (d_is_dir(dentry)) + return d_backing_inode(dentry); type = ovl_path_real(dentry, &realpath); - if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { - want_write = true; + if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); if (err) - goto out; + return ERR_PTR(err); - if (file->f_flags & O_TRUNC) + if (file_flags & O_TRUNC) err = ovl_copy_up_last(dentry, NULL, true); else err = ovl_copy_up(dentry); + ovl_drop_write(dentry); if (err) - goto out_drop_write; + return ERR_PTR(err); ovl_path_upper(dentry, &realpath); } - err = vfs_open(&realpath, file, cred); -out_drop_write: - if (want_write) - ovl_drop_write(dentry); -out: - return err; + return d_backing_inode(realpath.dentry); } static const struct inode_operations ovl_file_inode_operations = { @@ -378,7 +374,6 @@ static const struct inode_operations ovl_file_inode_operations = { .getxattr = ovl_getxattr, .listxattr = ovl_listxattr, .removexattr = ovl_removexattr, - .dentry_open = ovl_dentry_open, }; static const struct inode_operations ovl_symlink_inode_operations = { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 17ac5afc9ffb..ea5a40b06e3a 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, struct ovl_entry *oe); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 907870e81a72..70e9af551600 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -23,6 +23,7 @@ struct ovl_cache_entry { u64 ino; struct list_head l_node; struct rb_node node; + struct ovl_cache_entry *next_maybe_whiteout; bool is_whiteout; char name[]; }; @@ -39,7 +40,7 @@ struct ovl_readdir_data { struct rb_root root; struct list_head *list; struct list_head middle; - struct dentry *dir; + struct ovl_cache_entry *first_maybe_whiteout; int count; int err; }; @@ -79,7 +80,7 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } -static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, +static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { @@ -98,29 +99,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, p->is_whiteout = false; if (d_type == DT_CHR) { - struct dentry *dentry; - const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) { - kfree(p); - return NULL; - } - - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); - - dentry = lookup_one_len(name, dir, len); - if (!IS_ERR(dentry)) { - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } - revert_creds(old_cred); - put_cred(override_cred); + p->next_maybe_whiteout = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p; } return p; } @@ -148,7 +128,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, return 0; } - p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type); + p = ovl_cache_entry_new(rdd, name, len, ino, d_type); if (p == NULL) return -ENOMEM; @@ -169,7 +149,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { - p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else @@ -219,6 +199,43 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name, return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); } +static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) +{ + int err; + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + err = mutex_lock_killable(&dir->d_inode->i_mutex); + if (!err) { + while (rdd->first_maybe_whiteout) { + p = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one_len(p->name, dir, p->len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + } + mutex_unlock(&dir->d_inode->i_mutex); + } + revert_creds(old_cred); + put_cred(override_cred); + + return err; +} + static inline int ovl_dir_read(struct path *realpath, struct ovl_readdir_data *rdd) { @@ -229,7 +246,7 @@ static inline int ovl_dir_read(struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); - rdd->dir = realpath->dentry; + rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { rdd->count = 0; @@ -238,6 +255,10 @@ static inline int ovl_dir_read(struct path *realpath, if (err >= 0) err = rdd->err; } while (!err && rdd->count); + + if (!err && rdd->first_maybe_whiteout) + err = ovl_check_whiteouts(realpath->dentry, rdd); + fput(realfile); return err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index bf8537c7f455..79073d68b475 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -273,8 +273,56 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_REVALIDATE) { + ret = d->d_op->d_revalidate(d, flags); + if (ret < 0) + return ret; + if (!ret) { + if (!(flags & LOOKUP_RCU)) + d_invalidate(d); + return -ESTALE; + } + } + } + return 1; +} + +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) { + ret = d->d_op->d_weak_revalidate(d, flags); + if (ret <= 0) + break; + } + } + return ret; +} + static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, +}; + +static const struct dentry_operations ovl_reval_dentry_operations = { + .d_release = ovl_dentry_release, + .d_revalidate = ovl_dentry_revalidate, + .d_weak_revalidate = ovl_dentry_weak_revalidate, }; static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) @@ -288,6 +336,20 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) return oe; } +static bool ovl_dentry_remote(struct dentry *dentry) +{ + return dentry->d_flags & + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); +} + +static bool ovl_dentry_weird(struct dentry *dentry) +{ + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | + DCACHE_MANAGE_TRANSIT | + DCACHE_OP_HASH | + DCACHE_OP_COMPARE); +} + static inline struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name) { @@ -303,6 +365,10 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, } else if (!dentry->d_inode) { dput(dentry); dentry = NULL; + } else if (ovl_dentry_weird(dentry)) { + dput(dentry); + /* Don't support traversing automounts and other weirdness */ + dentry = ERR_PTR(-EREMOTE); } return dentry; } @@ -350,6 +416,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out; if (this) { + if (unlikely(ovl_dentry_remote(this))) { + dput(this); + err = -EREMOTE; + goto out; + } if (ovl_is_whiteout(this)) { dput(this); this = NULL; @@ -517,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct super_block *sb = dentry->d_sb; struct ovl_fs *ufs = sb->s_fs_info; - seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_show_option(m, "lowerdir", ufs->config.lowerdir); if (ufs->config.upperdir) { - seq_printf(m, ",upperdir=%s", ufs->config.upperdir); - seq_printf(m, ",workdir=%s", ufs->config.workdir); + seq_show_option(m, "upperdir", ufs->config.upperdir); + seq_show_option(m, "workdir", ufs->config.workdir); } return 0; } @@ -694,25 +765,6 @@ static void ovl_unescape(char *s) } } -static bool ovl_is_allowed_fs_type(struct dentry *root) -{ - const struct dentry_operations *dop = root->d_op; - - /* - * We don't support: - * - automount filesystems - * - filesystems with revalidate (FIXME for lower layer) - * - filesystems with case insensitive names - */ - if (dop && - (dop->d_manage || dop->d_automount || - dop->d_revalidate || dop->d_weak_revalidate || - dop->d_compare || dop->d_hash)) { - return false; - } - return true; -} - static int ovl_mount_dir_noesc(const char *name, struct path *path) { int err = -EINVAL; @@ -727,7 +779,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) goto out; } err = -EINVAL; - if (!ovl_is_allowed_fs_type(path->dentry)) { + if (ovl_dentry_weird(path->dentry)) { pr_err("overlayfs: filesystem on '%s' not supported\n", name); goto out_put; } @@ -751,13 +803,21 @@ static int ovl_mount_dir(const char *name, struct path *path) if (tmp) { ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); + + if (!err) + if (ovl_dentry_remote(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", + tmp); + path_put(path); + err = -EINVAL; + } kfree(tmp); } return err; } static int ovl_lower_dir(const char *name, struct path *path, long *namelen, - int *stack_depth) + int *stack_depth, bool *remote) { int err; struct kstatfs statfs; @@ -774,6 +834,9 @@ static int ovl_lower_dir(const char *name, struct path *path, long *namelen, *namelen = max(*namelen, statfs.f_namelen); *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + if (ovl_dentry_remote(path->dentry)) + *remote = true; + return 0; out_put: @@ -827,6 +890,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) unsigned int numlower; unsigned int stacklen = 0; unsigned int i; + bool remote = false; int err; err = -ENOMEM; @@ -900,7 +964,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { err = ovl_lower_dir(lower, &stack[numlower], - &ufs->lower_namelen, &sb->s_stack_depth); + &ufs->lower_namelen, &sb->s_stack_depth, + &remote); if (err) goto out_put_lowerpath; @@ -958,7 +1023,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; - sb->s_d_op = &ovl_dentry_operations; + if (remote) + sb->s_d_op = &ovl_reval_dentry_operations; + else + sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; oe = ovl_alloc_entry(numlower); diff --git a/fs/pnode.h b/fs/pnode.h index 7114ce6e6b9e..0fcdbe7ca648 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -20,8 +20,6 @@ #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) -#define IS_MNT_LOCKED_AND_LAZY(m) \ - (((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84bb65b83570..4fb17ded7d47 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - goto apply_umask; - - *acl = posix_acl_clone(p, GFP_NOFS); - if (!*acl) + clone = posix_acl_clone(p, GFP_NOFS); + if (!clone) goto no_mem; - ret = posix_acl_create_masq(*acl, mode); + ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; no_mem_clone: - posix_acl_release(*acl); + posix_acl_release(clone); no_mem: posix_acl_release(p); return -ENOMEM; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index d751fcb637bb..1ade1206bb89 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -75,3 +75,9 @@ config PROC_PAGE_MONITOR config PROC_CHILDREN bool "Include /proc/<pid>/task/<tid>/children file" default n + help + Provides a fast way to retrieve first level children pids of a task. See + <file:Documentation/filesystems/proc.txt> for more information. + + Say Y if you are running any user-space software which takes benefit from + this interface. For example, rkt is such a piece of software. diff --git a/fs/proc/array.c b/fs/proc/array.c index ce065cf3104f..f60f0121e331 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; - kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, + cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); @@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; + cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); + render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) diff --git a/fs/proc/base.c b/fs/proc/base.c index 1d540b3f226f..b25eee4cead5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -243,6 +243,11 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, len1 = arg_end - arg_start; len2 = env_end - env_start; + /* Empty ARGV. */ + if (len1 == 0) { + rv = 0; + goto out_free_page; + } /* * Inherently racy -- command line shares address space * with code and data. @@ -491,14 +496,17 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, } #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - seq_printf(m, "%llu %llu %lu\n", + if (unlikely(!sched_info_on())) + seq_printf(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); @@ -1222,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); - char *page, *tmp; - ssize_t length; uid_t loginuid; kuid_t kloginuid; + int rv; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { @@ -1234,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, } rcu_read_unlock(); - if (count >= PAGE_SIZE) - count = PAGE_SIZE - 1; - if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } - page = (char*)__get_free_page(GFP_TEMPORARY); - if (!page) - return -ENOMEM; - length = -EFAULT; - if (copy_from_user(page, buf, count)) - goto out_free_page; - page[count] = '\0'; - loginuid = simple_strtoul(page, &tmp, 10); - if (tmp == page) { - length = -EINVAL; - goto out_free_page; - - } + rv = kstrtou32_from_user(buf, count, 10, &loginuid); + if (rv < 0) + return rv; /* is userspace tring to explicitly UNSET the loginuid? */ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; - } + if (!uid_valid(kloginuid)) + return -EINVAL; } - length = audit_set_loginuid(kloginuid); - if (likely(length == 0)) - length = count; - -out_free_page: - free_page((unsigned long) page); - return length; + rv = audit_set_loginuid(kloginuid); + if (rv < 0) + return rv; + return count; } static const struct file_operations proc_loginuid_operations = { @@ -1327,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF], *end; + char buffer[PROC_NUMBUF]; int make_it_fail; + int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -1337,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - make_it_fail = simple_strtol(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; + rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); + if (rv < 0) + return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; @@ -1828,8 +1818,6 @@ end_instantiate: return dir_emit(ctx, name, len, 1, DT_UNKNOWN); } -#ifdef CONFIG_CHECKPOINT_RESTORE - /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. @@ -1856,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (!capable(CAP_SYS_ADMIN)) { - status = -EPERM; - goto out_notask; - } - inode = d_inode(dentry); task = get_proc_task(inode); if (!task) @@ -1949,6 +1932,29 @@ struct map_files_info { unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; +/* + * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the + * symlinks may be used to bypass permissions on ancestor directories in the + * path to the file in question. + */ +static const char * +proc_map_files_follow_link(struct dentry *dentry, void **cookie) +{ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return proc_pid_follow_link(dentry, NULL); +} + +/* + * Identical to proc_pid_link_inode_operations except for follow_link() + */ +static const struct inode_operations proc_map_files_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_map_files_follow_link, + .setattr = proc_setattr, +}; + static int proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -1964,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, ei = PROC_I(inode); ei->op.proc_get_link = proc_map_files_get_link; - inode->i_op = &proc_pid_link_inode_operations; + inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; inode->i_mode = S_IFLNK; @@ -1988,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, int result; struct mm_struct *mm; - result = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - result = -ENOENT; task = get_proc_task(dir); if (!task) @@ -2045,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct map_files_info *p; int ret; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) @@ -2237,7 +2235,6 @@ static const struct file_operations proc_timers_operations = { .llseek = seq_lseek, .release = seq_release_private, }; -#endif /* CONFIG_CHECKPOINT_RESTORE */ static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -2473,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file, { struct task_struct *task; struct mm_struct *mm; - char buffer[PROC_NUMBUF], *end; unsigned int val; int ret; int i; unsigned long mask; - ret = -EFAULT; - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - goto out_no_task; - - ret = -EINVAL; - val = (unsigned int)simple_strtoul(buffer, &end, 0); - if (*end == '\n') - end++; - if (end - buffer == 0) - goto out_no_task; + ret = kstrtouint_from_user(buf, count, 0, &val); + if (ret < 0) + return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; - ret = end - buffer; mm = get_task_mm(task); if (!mm) goto out_no_mm; @@ -2514,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, out_no_mm: put_task_struct(task); out_no_task: - return ret; + if (ret < 0) + return ret; + return count; } static const struct file_operations proc_coredump_filter_operations = { @@ -2736,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), -#ifdef CONFIG_CHECKPOINT_RESTORE DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), -#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET @@ -2787,7 +2772,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP @@ -3135,7 +3120,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP diff --git a/fs/proc/generic.c b/fs/proc/generic.c index df6327a2b865..ff3ffc76a937 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -26,7 +26,7 @@ #include "internal.h" -static DEFINE_SPINLOCK(proc_subdir_lock); +static DEFINE_RWLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { @@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, { int rv; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); rv = __xlate_proc_name(name, ret, residual); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return rv; } @@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, { struct inode *inode; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); if (de) { pde_get(de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); @@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, d_add(dentry, inode); return NULL; } - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); } @@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, if (!dir_emit_dots(file, ctx)) return 0; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); de = pde_subdir_first(de); i = ctx->pos - 2; for (;;) { if (!de) { - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return 0; } if (!i) @@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, do { struct proc_dir_entry *next; pde_get(de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); if (!dir_emit(ctx, de->name, de->namelen, de->low_ino, de->mode >> 12)) { pde_put(de); return 0; } - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); ctx->pos++; next = pde_subdir_next(de); pde_put(de); de = next; } while (de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return 1; } @@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp if (ret) return ret; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); proc_free_inum(dp->low_ino); return -EEXIST; } - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return 0; } @@ -373,6 +373,10 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; } + if (is_empty_pde(*parent)) { + WARN(1, "attempt to add to permanently empty directory"); + return NULL; + } ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); if (!ent) @@ -455,6 +459,25 @@ struct proc_dir_entry *proc_mkdir(const char *name, } EXPORT_SYMBOL(proc_mkdir); +struct proc_dir_entry *proc_create_mount_point(const char *name) +{ + umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO; + struct proc_dir_entry *ent, *parent = NULL; + + ent = __proc_create(&parent, name, mode, 2); + if (ent) { + ent->data = NULL; + ent->proc_fops = NULL; + ent->proc_iops = NULL; + if (proc_register(parent, ent) < 0) { + kfree(ent); + parent->nlink--; + ent = NULL; + } + } + return ent; +} + struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, @@ -526,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) const char *fn = name; unsigned int len; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return; } len = strlen(fn); @@ -536,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) rb_erase(&de->subdir_node, &parent->subdir); - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); return; @@ -560,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) const char *fn = name; unsigned int len; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return -ENOENT; } len = strlen(fn); root = pde_subdir_find(parent, fn, len); if (!root) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return -ENOENT; } rb_erase(&root->subdir_node, &parent->subdir); @@ -582,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) de = next; continue; } - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); proc_entry_rundown(de); next = de->parent; @@ -593,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) break; pde_put(de); - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); de = next; } pde_put(root); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index afe232b9df6e..bd95b9fdebb0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -422,6 +422,10 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; + if (is_empty_pde(de)) { + make_empty_dir_inode(inode); + return inode; + } if (de->mode) { inode->i_mode = de->mode; inode->i_uid = de->uid; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c835b94c0cd3..aa2781095bd1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -191,6 +191,12 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) } extern void pde_put(struct proc_dir_entry *); +static inline bool is_empty_pde(const struct proc_dir_entry *pde) +{ + return S_ISDIR(pde->mode) && !pde->proc_iops; +} +struct proc_dir_entry *proc_create_mount_point(const char *name); + /* * inode.c */ diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 91a4e6426321..92e6726f6e37 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) roundup(sizeof(CORE_STR), 4)) + roundup(sizeof(struct elf_prstatus), 4) + roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(sizeof(struct task_struct), 4); + roundup(arch_task_struct_size, 4); *elf_buflen = PAGE_ALIGN(*elf_buflen); return size + *elf_buflen; } @@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) /* set up the task structure */ notes[2].name = CORE_STR; notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = sizeof(struct task_struct); + notes[2].datasz = arch_task_struct_size; notes[2].data = current; nhdr->p_filesz += notesize(¬es[2]); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index d4a35746cab9..f8595e8b5cd0 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -64,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } seq_putc(m, '\n'); diff --git a/fs/proc/page.c b/fs/proc/page.c index 7eee2d8b97d9..93484034a03d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -9,12 +9,16 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> +#include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <linux/kernel-page-flags.h> #include <asm/uaccess.h> #include "internal.h" #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) +#define KPMBITS (KPMSIZE * BITS_PER_BYTE) /* /proc/kpagecount - an array exposing page counts * @@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, pfn++; out++; count -= KPMSIZE; + + cond_resched(); } *ppos += (char __user *)out - buf; @@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (page_is_idle(page)) + u |= 1 << KPF_IDLE; + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); @@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, pfn++; out++; count -= KPMSIZE; + + cond_resched(); } *ppos += (char __user *)out - buf; @@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = { .read = kpageflags_read, }; +#ifdef CONFIG_MEMCG +static ssize_t kpagecgroup_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 ino; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (ppage) + ino = page_cgroup_ino(ppage); + else + ino = 0; + + if (put_user(ino, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + + cond_resched(); + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecgroup_operations = { + .llseek = mem_lseek, + .read = kpagecgroup_read, +}; +#endif /* CONFIG_MEMCG */ + static int __init proc_page_init(void) { proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); +#ifdef CONFIG_MEMCG + proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); +#endif return 0; } fs_initcall(proc_page_init); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fea2561d773b..fdda62e6115e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -19,6 +19,28 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +/* Support for permanently empty directories */ + +struct ctl_table sysctl_mount_point[] = { + { } +}; + +static bool is_empty_dir(struct ctl_table_header *head) +{ + return head->ctl_table[0].child == sysctl_mount_point; +} + +static void set_empty_dir(struct ctl_dir *dir) +{ + dir->header.ctl_table[0].child = sysctl_mount_point; +} + +static void clear_empty_dir(struct ctl_dir *dir) + +{ + dir->header.ctl_table[0].child = NULL; +} + void proc_sys_poll_notify(struct ctl_table_poll *poll) { if (!poll) @@ -187,6 +209,17 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) struct ctl_table *entry; int err; + /* Is this a permanently empty directory? */ + if (is_empty_dir(&dir->header)) + return -EROFS; + + /* Am I creating a permanently empty directory? */ + if (header->ctl_table == sysctl_mount_point) { + if (!RB_EMPTY_ROOT(&dir->root)) + return -EINVAL; + set_empty_dir(dir); + } + dir->header.nreg++; header->parent = dir; err = insert_links(header); @@ -202,6 +235,8 @@ fail: erase_header(header); put_links(header); fail_links: + if (header->ctl_table == sysctl_mount_point) + clear_empty_dir(dir); header->parent = NULL; drop_sysctl_table(&dir->header); return err; @@ -419,6 +454,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; + if (is_empty_dir(head)) + make_empty_dir_inode(inode); } out: return inode; diff --git a/fs/proc/root.c b/fs/proc/root.c index b7fa4bfe896a..361ab4ee42fc 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -112,9 +112,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = task_active_pid_ns(current); options = data; - if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) - return ERR_PTR(-EPERM); - /* Does the mounter have privilege over the pid namespace? */ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -137,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } sb->s_flags |= MS_ACTIVE; + /* User space would break if executables appear on proc */ + sb->s_iflags |= SB_I_NOEXEC; } return dget(sb->s_root); @@ -159,7 +158,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -182,10 +181,10 @@ void __init proc_root_init(void) #endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); - proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ + proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ - proc_mkdir("openprom", NULL); + proc_create_mount_point("openprom"); #endif proc_tty_init(); proc_mkdir("bus", NULL); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6dee68d013ff..e2d46adb54b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -13,6 +13,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -310,7 +311,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) */ if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, "\n"); + seq_file_path(m, file, "\n"); goto done; } @@ -446,6 +447,7 @@ struct mem_size_stats { unsigned long anonymous_thp; unsigned long swap; u64 pss; + u64 swap_pss; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, @@ -458,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->resident += size; /* Accumulate the size in pages that have been accessed. */ - if (young || PageReferenced(page)) + if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; mapcount = page_mapcount(page); if (mapcount >= 2) { @@ -492,9 +494,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); - if (!non_swap_entry(swpent)) + if (!non_swap_entry(swpent)) { + int mapcount; + mss->swap += PAGE_SIZE; - else if (is_migration_entry(swpent)) + mapcount = swp_swapcount(swpent); + if (mapcount >= 2) { + u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; + + do_div(pss_delta, mapcount); + mss->swap_pss += pss_delta; + } else { + mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; + } + } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); } @@ -597,6 +610,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", + [ilog2(VM_UFFD_MISSING)]= "um", + [ilog2(VM_UFFD_WP)] = "uw", }; size_t i; @@ -638,6 +653,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" "Swap: %8lu kB\n" + "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n" "Locked: %8lu kB\n", @@ -652,6 +668,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.anonymous >> 10, mss.anonymous_thp >> 10, mss.swap >> 10, + (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10, (vma->vm_flags & VM_LOCKED) ? @@ -710,23 +727,6 @@ const struct file_operations proc_tid_smaps_operations = { .release = proc_map_release, }; -/* - * We do not want to have constant page-shift bits sitting in - * pagemap entries and are about to reuse them some time soon. - * - * Here's the "migration strategy": - * 1. when the system boots these bits remain what they are, - * but a warning about future change is printed in log; - * 2. once anyone clears soft-dirty bits via clear_refs file, - * these flag is set to denote, that user is aware of the - * new API and those page-shift bits change their meaning. - * The respective warning is printed in dmesg; - * 3. In a couple of releases we will remove all the mentions - * of page-shift in pagemap entries. - */ - -static bool soft_dirty_cleared __read_mostly; - enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, @@ -808,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); + test_and_clear_page_young(page); ClearPageReferenced(page); out: spin_unlock(ptl); @@ -835,6 +836,7 @@ out: /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); + test_and_clear_page_young(page); ClearPageReferenced(page); } pte_unmap_unlock(pte - 1, ptl); @@ -887,13 +889,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; - if (type == CLEAR_REFS_SOFT_DIRTY) { - soft_dirty_cleared = true; - pr_warn_once("The pagemap bits 55-60 has changed their meaning!" - " See the linux/Documentation/vm/pagemap.txt for " - "details.\n"); - } - task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -961,36 +956,26 @@ typedef struct { struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; - bool v2; + bool show_pfn; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) -#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) -#define PM_STATUS_BITS 3 -#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) -#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) -#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) -#define PM_PSHIFT_BITS 6 -#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) -#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) -#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) -#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) -/* in "new" pagemap pshift bits are occupied with more status bits */ -#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) - -#define __PM_SOFT_DIRTY (1LL) -#define PM_PRESENT PM_STATUS(4LL) -#define PM_SWAP PM_STATUS(2LL) -#define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + #define PM_END_OF_BUFFER 1 -static inline pagemap_entry_t make_pme(u64 val) +static inline pagemap_entry_t make_pme(u64 frame, u64 flags) { - return (pagemap_entry_t) { .pme = val }; + return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, @@ -1011,7 +996,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + pagemap_entry_t pme = make_pme(0, 0); /* End of address space hole, which we mark as non-present. */ unsigned long hole_end; @@ -1031,7 +1016,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, /* Addresses in the VMA. */ if (vma->vm_flags & VM_SOFTDIRTY) - pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); if (err) @@ -1042,67 +1027,42 @@ out: return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, +static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - u64 frame, flags; + u64 frame = 0, flags = 0; struct page *page = NULL; - int flags2 = 0; if (pte_present(pte)) { - frame = pte_pfn(pte); - flags = PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte); + flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags2 |= __PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags2 |= __PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY; entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); - flags = PM_SWAP; + flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); - } else { - if (vma->vm_flags & VM_SOFTDIRTY) - flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); - return; } if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY)) - flags2 |= __PM_SOFT_DIRTY; - - *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); -} + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset, int pmd_flags2) -{ - /* - * Currently pmd for thp is always present because thp can not be - * swapped-out, migrated, or HWPOISONed (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ - if (pmd_present(pmd)) - *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); - else - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); + return make_pme(frame, flags); } -#else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset, int pmd_flags2) -{ -} -#endif -static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, +static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; @@ -1111,41 +1071,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte, *orig_pte; int err = 0; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - int pmd_flags2; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; - if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) - pmd_flags2 = __PM_SOFT_DIRTY; - else - pmd_flags2 = 0; + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + + /* + * Currently pmd for thp is always present because thp + * can not be swapped-out, migrated, or HWPOISONed + * (split in such cases instead.) + * This if-check is just to prepare for future implementation. + */ + if (pmd_present(pmd)) { + struct page *page = pmd_page(pmd); + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + } for (; addr != end; addr += PAGE_SIZE) { - unsigned long offset; - pagemap_entry_t pme; + pagemap_entry_t pme = make_pme(frame, flags); - offset = (addr & ~PAGEMAP_WALK_MASK) >> - PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; } spin_unlock(ptl); return err; } - if (pmd_trans_unstable(pmd)) + if (pmd_trans_unstable(pmdp)) return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ - orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, *pte); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1158,40 +1135,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pte_t pte, int offset, int flags2) -{ - if (pte_present(pte)) - *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | - PM_STATUS2(pm->v2, flags2) | - PM_PRESENT); - else - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | - PM_STATUS2(pm->v2, flags2)); -} - /* This function walks within one hugetlb entry in the single call */ -static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, +static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; + u64 flags = 0, frame = 0; int err = 0; - int flags2; - pagemap_entry_t pme; + pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) - flags2 = __PM_SOFT_DIRTY; - else - flags2 = 0; + flags |= PM_SOFT_DIRTY; + + pte = huge_ptep_get(ptep); + if (pte_present(pte)) { + struct page *page = pte_page(pte); + + if (!PageAnon(page)) + flags |= PM_FILE; + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte) + + ((addr & ~hmask) >> PAGE_SHIFT); + } for (; addr != end; addr += PAGE_SIZE) { - int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); + pagemap_entry_t pme = make_pme(frame, flags); + err = add_to_pagemap(addr, &pme, pm); if (err) return err; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; } cond_resched(); @@ -1209,7 +1190,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bits 55-60 page shift (page size = 1<<page shift) + * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 56 page exclusively mapped + * Bits 57-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present @@ -1227,42 +1210,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file_inode(file)); - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; struct pagemapread pm; - int ret = -ESRCH; struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; - int copied = 0; + int ret = 0, copied = 0; - if (!task) + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) goto out; ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) - goto out_task; + goto out_mm; ret = 0; if (!count) - goto out_task; + goto out_mm; + + /* do not disclose physical addresses: attack vector */ + pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); - pm.v2 = soft_dirty_cleared; pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) - goto out_task; - - mm = mm_access(task, PTRACE_MODE_READ); - ret = PTR_ERR(mm); - if (!mm || IS_ERR(mm)) - goto out_free; + goto out_mm; - pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pmd_entry = pagemap_pmd_range; pagemap_walk.pte_hole = pagemap_pte_hole; #ifdef CONFIG_HUGETLB_PAGE pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; @@ -1273,10 +1251,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, src = *ppos; svpfn = src / PM_ENTRY_BYTES; start_vaddr = svpfn << PAGE_SHIFT; - end_vaddr = TASK_SIZE_OF(task); + end_vaddr = mm->task_size; /* watch out for wraparound */ - if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + if (svpfn > mm->task_size >> PAGE_SHIFT) start_vaddr = end_vaddr; /* @@ -1303,7 +1281,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; - goto out_mm; + goto out_free; } copied += len; buf += len; @@ -1313,24 +1291,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!ret || ret == PM_END_OF_BUFFER) ret = copied; -out_mm: - mmput(mm); out_free: kfree(pm.buffer); -out_task: - put_task_struct(task); +out_mm: + mmput(mm); out: return ret; } static int pagemap_open(struct inode *inode, struct file *file) { - /* do not disclose physical addresses: attack vector */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " - "to stop being page-shift some time soon. See the " - "linux/Documentation/vm/pagemap.txt for details.\n"); + struct mm_struct *mm; + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + file->private_data = mm; + return 0; +} + +static int pagemap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) + mmdrop(mm); return 0; } @@ -1338,6 +1323,7 @@ const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, + .release = pagemap_release, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ @@ -1509,7 +1495,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (file) { seq_puts(m, " file="); - seq_path(m, &file->f_path, "\n\t= "); + seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); } else { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 599ec2e20104..e0d64c92e4f6 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -180,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } else if (mm) { pid_t tid = pid_of_stack(priv, vma, is_pid); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8db932da4009..8ebd9a334085 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -17,7 +17,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { - struct proc_mounts *p = proc_mounts(file->private_data); + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; struct mnt_namespace *ns = p->ns; unsigned res = POLLIN | POLLRDNORM; int event; @@ -25,8 +26,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); event = ACCESS_ONCE(ns->event); - if (p->m.poll_event != event) { - p->m.poll_event = event; + if (m->poll_event != event) { + m->poll_event = event; res |= POLLERR | POLLPRI; } @@ -92,7 +93,7 @@ static void show_type(struct seq_file *m, struct super_block *sb) static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -126,7 +127,7 @@ out: static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -186,7 +187,7 @@ out: static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; @@ -236,6 +237,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, struct mnt_namespace *ns = NULL; struct path root; struct proc_mounts *p; + struct seq_file *m; int ret = -EINVAL; if (!task) @@ -260,26 +262,21 @@ static int mounts_open_common(struct inode *inode, struct file *file, task_unlock(task); put_task_struct(task); - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (!p) + ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts)); + if (ret) goto err_put_path; - file->private_data = &p->m; - ret = seq_open(file, &mounts_op); - if (ret) - goto err_free; + m = file->private_data; + m->poll_event = ns->event; + p = m->private; p->ns = ns; p->root = root; - p->m.poll_event = ns->event; p->show = show; p->cached_event = ~0ULL; return 0; - err_free: - kfree(p); err_put_path: path_put(&root); err_put_ns: @@ -290,10 +287,11 @@ static int mounts_open_common(struct inode *inode, struct file *file, static int mounts_release(struct inode *inode, struct file *file) { - struct proc_mounts *p = proc_mounts(file->private_data); + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; path_put(&p->root); put_mnt_ns(p->ns); - return seq_release(inode, file); + return seq_release_private(inode, file); } static int mounts_open(struct inode *inode, struct file *file) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index dc43b5f29305..3adcc4669fac 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -461,22 +461,18 @@ static struct file_system_type pstore_fs_type = { .kill_sb = pstore_kill_sb, }; -static struct kobject *pstore_kobj; - static int __init init_pstore_fs(void) { - int err = 0; + int err; /* Create a convenient mount point for people to access pstore */ - pstore_kobj = kobject_create_and_add("pstore", fs_kobj); - if (!pstore_kobj) { - err = -ENOMEM; + err = sysfs_create_mount_point(fs_kobj, "pstore"); + if (err) goto out; - } err = register_filesystem(&pstore_fs_type); if (err < 0) - kobject_put(pstore_kobj); + sysfs_remove_mount_point(fs_kobj, "pstore"); out: return err; diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8d64bb5366bf..e1f37278cf97 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n) return page; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static unsigned last_entry(struct inode *inode, unsigned long page_nr) { unsigned long last_byte = inode->i_size; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 20d1f74561cf..ef0d64b2a6d9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -247,7 +247,7 @@ struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); -static void __dquot_initialize(struct inode *inode, int type); +static int __dquot_initialize(struct inode *inode, int type); static inline unsigned int hashfn(const struct super_block *sb, struct kqid qid) @@ -832,16 +832,17 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) struct dquot *dqget(struct super_block *sb, struct kqid qid) { unsigned int hashent = hashfn(sb, qid); - struct dquot *dquot = NULL, *empty = NULL; + struct dquot *dquot, *empty = NULL; if (!sb_has_quota_active(sb, qid.type)) - return NULL; + return ERR_PTR(-ESRCH); we_slept: spin_lock(&dq_list_lock); spin_lock(&dq_state_lock); if (!sb_has_quota_active(sb, qid.type)) { spin_unlock(&dq_state_lock); spin_unlock(&dq_list_lock); + dquot = ERR_PTR(-ESRCH); goto out; } spin_unlock(&dq_state_lock); @@ -876,11 +877,15 @@ we_slept: * already finished or it will be canceled due to dq_count > 1 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && - sb->dq_op->acquire_dquot(dquot) < 0) { - dqput(dquot); - dquot = NULL; - goto out; + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + int err; + + err = sb->dq_op->acquire_dquot(dquot); + if (err < 0) { + dqput(dquot); + dquot = ERR_PTR(err); + goto out; + } } #ifdef CONFIG_QUOTA_DEBUG BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ @@ -923,7 +928,7 @@ static void add_dquot_ref(struct super_block *sb, int type) int reserved = 0; #endif - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -934,7 +939,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) @@ -946,15 +951,15 @@ static void add_dquot_ref(struct super_block *sb, int type) /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock We cannot iput the inode now as we can be + * s_inode_list_lock. We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ old_inode = inode; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1023,7 +1028,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct inode *inode; int reserved = 0; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already @@ -1039,7 +1044,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, } spin_unlock(&dq_data_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" @@ -1390,15 +1395,16 @@ static int dquot_active(const struct inode *inode) * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ -static void __dquot_initialize(struct inode *inode, int type) +static int __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; struct dquot **dquots, *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; qsize_t rsv; + int ret = 0; if (!dquot_active(inode)) - return; + return 0; dquots = i_dquot(inode); @@ -1407,6 +1413,7 @@ static void __dquot_initialize(struct inode *inode, int type) struct kqid qid; kprojid_t projid; int rc; + struct dquot *dquot; got[cnt] = NULL; if (type != -1 && cnt != type) @@ -1438,16 +1445,25 @@ static void __dquot_initialize(struct inode *inode, int type) qid = make_kqid_projid(projid); break; } - got[cnt] = dqget(sb, qid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) { + /* We raced with somebody turning quotas off... */ + if (PTR_ERR(dquot) != -ESRCH) { + ret = PTR_ERR(dquot); + goto out_put; + } + dquot = NULL; + } + got[cnt] = dquot; } /* All required i_dquot has been initialized */ if (!init_needed) - return; + return 0; spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) - goto out_err; + goto out_lock; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; @@ -1469,15 +1485,18 @@ static void __dquot_initialize(struct inode *inode, int type) dquot_resv_space(dquots[cnt], rsv); } } -out_err: +out_lock: spin_unlock(&dq_data_lock); +out_put: /* Drop unused references */ dqput_all(got); + + return ret; } -void dquot_initialize(struct inode *inode) +int dquot_initialize(struct inode *inode) { - __dquot_initialize(inode, -1); + return __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); @@ -1961,18 +1980,37 @@ EXPORT_SYMBOL(__dquot_transfer); int dquot_transfer(struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; + struct dquot *dquot; struct super_block *sb = inode->i_sb; int ret; if (!dquot_active(inode)) return 0; - if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) - transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid)); - if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)) - transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid)); - + if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){ + dquot = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (IS_ERR(dquot)) { + if (PTR_ERR(dquot) != -ESRCH) { + ret = PTR_ERR(dquot); + goto out_put; + } + dquot = NULL; + } + transfer_to[USRQUOTA] = dquot; + } + if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){ + dquot = dqget(sb, make_kqid_gid(iattr->ia_gid)); + if (IS_ERR(dquot)) { + if (PTR_ERR(dquot) != -ESRCH) { + ret = PTR_ERR(dquot); + goto out_put; + } + dquot = NULL; + } + transfer_to[GRPQUOTA] = dquot; + } ret = __dquot_transfer(inode, transfer_to); +out_put: dqput_all(transfer_to); return ret; } @@ -2518,8 +2556,8 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid, struct dquot *dquot; dquot = dqget(sb, qid); - if (!dquot) - return -ESRCH; + if (IS_ERR(dquot)) + return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); @@ -2631,8 +2669,8 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid qid, int rc; dquot = dqget(sb, qid); - if (!dquot) { - rc = -ESRCH; + if (IS_ERR(dquot)) { + rc = PTR_ERR(dquot); goto out; } rc = do_set_dqblk(dquot, di); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 86ded7375c21..3746367098fd 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -141,9 +141,9 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr) if (tstate->flags & QCI_ROOT_SQUASH) uinfo.dqi_flags |= DQF_ROOT_SQUASH; uinfo.dqi_valid = IIF_ALL; - if (!ret && copy_to_user(addr, &uinfo, sizeof(uinfo))) + if (copy_to_user(addr, &uinfo, sizeof(uinfo))) return -EFAULT; - return ret; + return 0; } static int quota_setinfo(struct super_block *sb, int type, void __user *addr) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f6f2fbad9777..3d8e7e671d5b 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3319,8 +3319,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); + if (is_quota_modification(inode, attr)) { + error = dquot_initialize(inode); + if (error) + return error; + } reiserfs_write_lock(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index b55a074653d7..5f1c9c29eb8c 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -613,8 +613,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) * we have to set uid and gid here */ inode_init_owner(inode, dir, mode); - dquot_initialize(inode); - return 0; + return dquot_initialize(inode); } static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -633,12 +632,18 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } - new_inode_init(inode, dir, mode); + retval = new_inode_init(inode, dir, mode); + if (retval) { + drop_new_inode(inode); + return retval; + } jbegin_count += reiserfs_cache_default_acl(dir); retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); @@ -710,12 +715,18 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode if (!new_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } - new_inode_init(inode, dir, mode); + retval = new_inode_init(inode, dir, mode); + if (retval) { + drop_new_inode(inode); + return retval; + } jbegin_count += reiserfs_cache_default_acl(dir); retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); @@ -787,7 +798,9 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* @@ -800,7 +813,11 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } - new_inode_init(inode, dir, mode); + retval = new_inode_init(inode, dir, mode); + if (retval) { + drop_new_inode(inode); + return retval; + } jbegin_count += reiserfs_cache_default_acl(dir); retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); @@ -899,7 +916,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); @@ -985,7 +1004,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) int jbegin_count; unsigned long savelink; - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; inode = d_inode(dentry); @@ -1095,12 +1116,18 @@ static int reiserfs_symlink(struct inode *parent_dir, 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); - dquot_initialize(parent_dir); + retval = dquot_initialize(parent_dir); + if (retval) + return retval; if (!(inode = new_inode(parent_dir->i_sb))) { return -ENOMEM; } - new_inode_init(inode, parent_dir, mode); + retval = new_inode_init(inode, parent_dir, mode); + if (retval) { + drop_new_inode(inode); + return retval; + } retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, &security); @@ -1184,7 +1211,9 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - dquot_initialize(dir); + retval = dquot_initialize(dir); + if (retval) + return retval; reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { @@ -1308,8 +1337,12 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); - dquot_initialize(old_dir); - dquot_initialize(new_dir); + retval = dquot_initialize(old_dir); + if (retval) + return retval; + retval = dquot_initialize(new_dir); + if (retval) + return retval; old_inode = d_inode(old_dentry); new_dentry_inode = d_inode(new_dentry); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0e4cf728126f..4a62fe8cc3bf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",acl"); if (REISERFS_SB(s)->s_jdev) - seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev); if (journal->j_max_commit_age != journal->j_default_max_commit_age) seq_printf(seq, ",commit=%d", journal->j_max_commit_age); #ifdef CONFIG_QUOTA if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", + REISERFS_SB(s)->s_qf_names[USRQUOTA]); else if (opts & (1 << REISERFS_USRQUOTA)) seq_puts(seq, ",usrquota"); if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", + REISERFS_SB(s)->s_qf_names[GRPQUOTA]); else if (opts & (1 << REISERFS_GRPQUOTA)) seq_puts(seq, ",grpquota"); if (REISERFS_SB(s)->s_jquota_fmt) { diff --git a/fs/seq_file.c b/fs/seq_file.c index 555f82155be8..263b125dbcf4 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/cred.h> #include <linux/mm.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -48,18 +49,21 @@ static void *seq_buf_alloc(unsigned long size) * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. * Returning SEQ_SKIP means "discard this element and move on". + * Note: seq_open() will allocate a struct seq_file and store its + * pointer in @file->private_data. This pointer should not be modified. */ int seq_open(struct file *file, const struct seq_operations *op) { - struct seq_file *p = file->private_data; + struct seq_file *p; + + WARN_ON(file->private_data); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + file->private_data = p; - if (!p) { - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return -ENOMEM; - file->private_data = p; - } - memset(p, 0, sizeof(*p)); mutex_init(&p->lock); p->op = op; #ifdef CONFIG_USER_NS @@ -487,6 +491,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc) } EXPORT_SYMBOL(seq_path); +/** + * seq_file_path - seq_file interface to print a pathname of a file + * @m: the seq_file handle + * @file: the struct file to print + * @esc: set of characters to escape in the output + * + * return the absolute path to the file. + */ +int seq_file_path(struct seq_file *m, struct file *file, const char *esc) +{ + return seq_path(m, &file->f_path, esc); +} +EXPORT_SYMBOL(seq_file_path); + /* * Same as seq_path, but relative to supplied root. */ @@ -538,6 +556,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) return res; } +EXPORT_SYMBOL(seq_dentry); static void *single_start(struct seq_file *p, loff_t *pos) { @@ -755,6 +774,47 @@ void seq_pad(struct seq_file *m, char c) } EXPORT_SYMBOL(seq_pad); +/* A complete analogue of print_hex_dump() */ +void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, + int rowsize, int groupsize, const void *buf, size_t len, + bool ascii) +{ + const u8 *ptr = buf; + int i, linelen, remaining = len; + int ret; + + if (rowsize != 16 && rowsize != 32) + rowsize = 16; + + for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) { + linelen = min(remaining, rowsize); + remaining -= rowsize; + + switch (prefix_type) { + case DUMP_PREFIX_ADDRESS: + seq_printf(m, "%s%p: ", prefix_str, ptr + i); + break; + case DUMP_PREFIX_OFFSET: + seq_printf(m, "%s%.8x: ", prefix_str, i); + break; + default: + seq_printf(m, "%s", prefix_str); + break; + } + + ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, + m->buf + m->count, m->size - m->count, + ascii); + if (ret >= m->size - m->count) { + seq_set_overflow(m); + } else { + m->count += ret; + seq_putc(m, '\n'); + } + } +} +EXPORT_SYMBOL(seq_hex_dump); + struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; diff --git a/fs/signalfd.c b/fs/signalfd.c index 7e412ad74836..270221fcef42 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -121,8 +121,9 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (kinfo->si_code == BUS_MCEERR_AR || - kinfo->si_code == BUS_MCEERR_AO) + if (kinfo->si_signo == SIGBUS && + (kinfo->si_code == BUS_MCEERR_AR || + kinfo->si_code == BUS_MCEERR_AO)) err |= __put_user((short) kinfo->si_addr_lsb, &uinfo->ssi_addr_lsb); #endif diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index 73588e7700ed..d09fcd6fb85d 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -49,6 +49,6 @@ struct squashfs_inode_info { static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) { - return list_entry(inode, struct squashfs_inode_info, vfs_inode); + return container_of(inode, struct squashfs_inode_info, vfs_inode); } #endif diff --git a/fs/super.c b/fs/super.c index 928c20f47af9..954aeb80e202 100644 --- a/fs/super.c +++ b/fs/super.c @@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink, return total_objects; } +static void destroy_super_work(struct work_struct *work) +{ + struct super_block *s = container_of(work, struct super_block, + destroy_work); + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_free_rwsem(&s->s_writers.rw_sem[i]); + kfree(s); +} + +static void destroy_super_rcu(struct rcu_head *head) +{ + struct super_block *s = container_of(head, struct super_block, rcu); + INIT_WORK(&s->destroy_work, destroy_super_work); + schedule_work(&s->destroy_work); +} + /** * destroy_super - frees a superblock * @s: superblock to free @@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink, */ static void destroy_super(struct super_block *s) { - int i; list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - for (i = 0; i < SB_FREEZE_LEVELS; i++) - percpu_counter_destroy(&s->s_writers.counter[i]); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); - kfree_rcu(s, rcu); + call_rcu(&s->rcu, destroy_super_rcu); } /** @@ -178,19 +193,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0, - GFP_KERNEL) < 0) + if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], + sb_writers_name[i], + &type->s_writers_key[i])) goto fail; - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], - &type->s_writers_key[i], 0); } - init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); + mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); + spin_lock_init(&s->s_inode_list_lock); if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; @@ -399,7 +414,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~MS_ACTIVE; - fsnotify_unmount_inodes(&sb->s_inodes); + fsnotify_unmount_inodes(sb); evict_inodes(sb); @@ -842,7 +857,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if (dev == (1 << MINORBITS)) { + if (dev >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) @@ -1146,72 +1161,46 @@ out: */ void __sb_end_write(struct super_block *sb, int level) { - percpu_counter_dec(&sb->s_writers.counter[level-1]); - /* - * Make sure s_writers are updated before we wake up waiters in - * freeze_super(). - */ - smp_mb(); - if (waitqueue_active(&sb->s_writers.wait)) - wake_up(&sb->s_writers.wait); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); + percpu_up_read(sb->s_writers.rw_sem + level-1); } EXPORT_SYMBOL(__sb_end_write); -#ifdef CONFIG_LOCKDEP -/* - * We want lockdep to tell us about possible deadlocks with freezing but - * it's it bit tricky to properly instrument it. Getting a freeze protection - * works as getting a read lock but there are subtle problems. XFS for example - * gets freeze protection on internal level twice in some cases, which is OK - * only because we already hold a freeze protection also on higher level. Due - * to these cases we have to tell lockdep we are doing trylock when we - * already hold a freeze protection for a higher freeze level. - */ -static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, - unsigned long ip) -{ - int i; - - if (!trylock) { - for (i = 0; i < level - 1; i++) - if (lock_is_held(&sb->s_writers.lock_map[i])) { - trylock = true; - break; - } - } - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); -} -#endif - /* * This is an internal function, please use sb_start_{write,pagefault,intwrite} * instead. */ int __sb_start_write(struct super_block *sb, int level, bool wait) { -retry: - if (unlikely(sb->s_writers.frozen >= level)) { - if (!wait) - return 0; - wait_event(sb->s_writers.wait_unfrozen, - sb->s_writers.frozen < level); - } + bool force_trylock = false; + int ret = 1; #ifdef CONFIG_LOCKDEP - acquire_freeze_lock(sb, level, !wait, _RET_IP_); -#endif - percpu_counter_inc(&sb->s_writers.counter[level-1]); /* - * Make sure counter is updated before we check for frozen. - * freeze_super() first sets frozen and then checks the counter. + * We want lockdep to tell us about possible deadlocks with freezing + * but it's it bit tricky to properly instrument it. Getting a freeze + * protection works as getting a read lock but there are subtle + * problems. XFS for example gets freeze protection on internal level + * twice in some cases, which is OK only because we already hold a + * freeze protection also on higher level. Due to these cases we have + * to use wait == F (trylock mode) which must not fail. */ - smp_mb(); - if (unlikely(sb->s_writers.frozen >= level)) { - __sb_end_write(sb, level); - goto retry; + if (wait) { + int i; + + for (i = 0; i < level - 1; i++) + if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { + force_trylock = true; + break; + } } - return 1; +#endif + if (wait && !force_trylock) + percpu_down_read(sb->s_writers.rw_sem + level-1); + else + ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); + + WARN_ON(force_trylock & !ret); + return ret; } EXPORT_SYMBOL(__sb_start_write); @@ -1221,37 +1210,33 @@ EXPORT_SYMBOL(__sb_start_write); * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file - * system. Caller of this function should make sure there can be no new writers - * of type @level before calling this function. Otherwise this function can - * livelock. + * system. */ static void sb_wait_write(struct super_block *sb, int level) { - s64 writers; - + percpu_down_write(sb->s_writers.rw_sem + level-1); /* - * We just cycle-through lockdep here so that it does not complain - * about returning with lock to userspace + * We are going to return to userspace and forget about this lock, the + * ownership goes to the caller of thaw_super() which does unlock. + * + * FIXME: we should do this before return from freeze_super() after we + * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() + * should re-acquire these locks before s_op->unfreeze_fs(sb). However + * this leads to lockdep false-positives, so currently we do the early + * release right after acquire. */ - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); - - do { - DEFINE_WAIT(wait); + percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); +} - /* - * We use a barrier in prepare_to_wait() to separate setting - * of frozen and checking of the counter - */ - prepare_to_wait(&sb->s_writers.wait, &wait, - TASK_UNINTERRUPTIBLE); +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; - writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); - if (writers) - schedule(); + for (level = 0; level < SB_FREEZE_LEVELS; ++level) + percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); - finish_wait(&sb->s_writers.wait, &wait); - } while (writers); + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_up_write(sb->s_writers.rw_sem + level); } /** @@ -1310,20 +1295,14 @@ int freeze_super(struct super_block *sb) return 0; } - /* From now on, no new normal writers can start */ sb->s_writers.frozen = SB_FREEZE_WRITE; - smp_wmb(); - /* Release s_umount to preserve sb_start_write -> s_umount ordering */ up_write(&sb->s_umount); - sb_wait_write(sb, SB_FREEZE_WRITE); + down_write(&sb->s_umount); /* Now we go and block page faults... */ - down_write(&sb->s_umount); sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; - smp_wmb(); - sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ @@ -1331,7 +1310,6 @@ int freeze_super(struct super_block *sb) /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; - smp_wmb(); sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { @@ -1340,7 +1318,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1372,8 +1350,10 @@ int thaw_super(struct super_block *sb) return -EINVAL; } - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & MS_RDONLY) { + sb->s_writers.frozen = SB_UNFROZEN; goto out; + } if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -1385,12 +1365,11 @@ int thaw_super(struct super_block *sb) } } -out: sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); +out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); - return 0; } EXPORT_SYMBOL(thaw_super); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0b45ff42f374..94374e435025 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -121,3 +121,37 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } + +/** + * sysfs_create_mount_point - create an always empty directory + * @parent_kobj: kobject that will contain this always empty directory + * @name: The name of the always empty directory to add + */ +int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) +{ + struct kernfs_node *kn, *parent = parent_kobj->sd; + + kn = kernfs_create_empty_dir(parent, name); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_create_mount_point); + +/** + * sysfs_remove_mount_point - remove an always empty directory. + * @parent_kobj: kobject that will contain this always empty directory + * @name: The name of the always empty directory to remove + * + */ +void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) +{ + struct kernfs_node *parent = parent_kobj->sd; + + kernfs_remove_by_name_ns(parent, name, NULL); +} +EXPORT_SYMBOL_GPL(sysfs_remove_mount_point); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8a49486bf30c..f3db82071cfb 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -31,9 +31,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, bool new_sb; if (!(flags & MS_KERNMOUNT)) { - if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) - return ERR_PTR(-EPERM); - if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return ERR_PTR(-EPERM); } @@ -43,6 +40,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + else if (new_sb) + /* Userspace would break if executables appear on sysfs */ + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } @@ -58,7 +59,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 8f3555f00c54..63c1bcb224ee 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 2c13525131cd..6c212288adcb 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -73,7 +73,7 @@ struct sysv_inode_info { static inline struct sysv_inode_info *SYSV_I(struct inode *inode) { - return list_entry(inode, struct sysv_inode_info, vfs_inode); + return container_of(inode, struct sysv_inode_info, vfs_inode); } static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index d92bdf3b079a..cbc8d5d2755a 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -496,16 +496,11 @@ struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *pare return dentry; } -static inline int tracefs_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (tracefs_positive(dentry)) { + if (simple_positive(dentry)) { if (dentry->d_inode) { dget(dentry); switch (dentry->d_inode->i_mode & S_IFMT) { @@ -582,7 +577,7 @@ void tracefs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!tracefs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ @@ -603,7 +598,7 @@ void tracefs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * tracefs_positive() check. + * simple_positive() check. */ goto loop; } @@ -631,14 +626,12 @@ bool tracefs_initialized(void) return tracefs_registered; } -static struct kobject *trace_kobj; - static int __init tracefs_init(void) { int retval; - trace_kobj = kobject_create_and_add("tracing", kernel_kobj); - if (!trace_kobj) + retval = sysfs_create_mount_point(kernel_kobj, "tracing"); + if (retval) return -EINVAL; retval = register_filesystem(&trace_fs_type); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 6afac3d561ac..8d0b3ade0ff0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1652,17 +1652,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); - use->descTag.tagLocation = - cpu_to_le32(iinfo->i_location.logicalBlockNum); - crclen = sizeof(struct unallocSpaceEntry) + - iinfo->i_lenAlloc - sizeof(struct tag); - use->descTag.descCRCLength = cpu_to_le16(crclen); - use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + - sizeof(struct tag), - crclen)); - use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); + crclen = sizeof(struct unallocSpaceEntry); - goto out; + goto finish; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) @@ -1782,6 +1774,8 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } + +finish: if (iinfo->i_strat4096) { fe->icbTag.strategyType = cpu_to_le16(4096); fe->icbTag.strategyParameter = cpu_to_le16(1); @@ -1791,7 +1785,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->icbTag.numEntries = cpu_to_le16(1); } - if (S_ISDIR(inode->i_mode)) + if (iinfo->i_use) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE; + else if (S_ISDIR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; else if (S_ISREG(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; @@ -1828,7 +1824,6 @@ static int udf_update_inode(struct inode *inode, int do_sync) crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); -out: set_buffer_uptodate(bh); unlock_buffer(bh); diff --git a/fs/udf/super.c b/fs/udf/super.c index b96f190bc567..81155b9b445b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2070,6 +2070,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) struct udf_options uopt; struct kernel_lb_addr rootdir, fileset; struct udf_sb_info *sbi; + bool lvid_open = false; uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); uopt.uid = INVALID_UID; @@ -2216,8 +2217,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) le16_to_cpu(ts.year), ts.month, ts.day, ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); } - if (!(sb->s_flags & MS_RDONLY)) + if (!(sb->s_flags & MS_RDONLY)) { udf_open_lvid(sb); + lvid_open = true; + } /* Assign the root inode */ /* assign inodes by physical block number */ @@ -2248,7 +2251,7 @@ parse_options_failure: if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); #endif - if (!(sb->s_flags & MS_RDONLY)) + if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); udf_sb_free_partitions(sb); diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index b5cd8ed2aa12..b1b9a63d8cf3 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -56,7 +56,7 @@ struct udf_inode_info { static inline struct udf_inode_info *UDF_I(struct inode *inode) { - return list_entry(inode, struct udf_inode_info, vfs_inode); + return container_of(inode, struct udf_inode_info, vfs_inode); } #endif /* _UDF_I_H) */ diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index 4d0e02b022b3..392db25c0b56 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ - namei.o super.o symlink.o truncate.o util.o + namei.o super.o symlink.o util.o ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 2c1036080d52..dc5fae601c24 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -51,8 +51,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); - - lock_ufs(sb); + + mutex_lock(&UFS_SB(sb)->s_lock); cgno = ufs_dtog(uspi, fragment); bit = ufs_dtogd(uspi, fragment); @@ -115,13 +115,13 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - - unlock_ufs(sb); + + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); return; failed: - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return; } @@ -151,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) goto failed; } - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); do_more: overflow = 0; @@ -211,12 +211,12 @@ do_more: } ufs_mark_sb_dirty(sb); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); return; failed_unlock: - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); failed: UFSD("EXIT (FAILED)\n"); return; @@ -357,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, usb1 = ubh_get_usb_first(uspi); *err = -ENOSPC; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); tmp = ufs_data_ptr_to_cpu(sb, p); if (count + ufs_fragnum(fragment) > uspi->s_fpb) { @@ -378,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, "fragment %llu, tmp %llu\n", (unsigned long long)fragment, (unsigned long long)tmp); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return INVBLOCK; } if (fragment < UFS_I(inode)->i_lastfrag) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } } else { if (tmp) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } } @@ -399,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, * There is not enough space for user on the device */ if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return 0; } @@ -417,14 +417,16 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (oldcount == 0) { result = ufs_alloc_fragments (inode, cgno, goal, count, err); if (result) { + ufs_clear_frags(inode, result + oldcount, + newcount - oldcount, locked_page != NULL); + write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); + write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - ufs_clear_frags(inode, result + oldcount, - newcount - oldcount, locked_page != NULL); } - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -439,7 +441,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -473,11 +475,13 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, uspi->s_sbbase + result, locked_page); + write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); + write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -485,7 +489,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return result; } - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return 0; } diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 1bfe8cabff0f..74f2e80288bf 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -65,11 +65,6 @@ static inline void ufs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long ufs_dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; @@ -87,7 +82,8 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) /* Releases the page */ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode) + struct page *page, struct inode *inode, + bool update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_address(page); @@ -103,7 +99,8 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, err = ufs_commit_chunk(page, pos, len); ufs_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(dir); } @@ -256,7 +253,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, int namelen = qstr->len; unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); struct page *page = NULL; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -320,7 +317,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) unsigned short rec_len, name_len; struct page *page = NULL; struct ufs_dir_entry *de; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; loff_t pos; @@ -437,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; - unsigned long npages = ufs_dir_pages(inode); + unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); int need_revalidate = file->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; @@ -608,7 +605,7 @@ int ufs_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; struct page *page = NULL; - unsigned long i, npages = ufs_dir_pages(inode); + unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { char *kaddr; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 7caa01652888..fd0203ce1f7f 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -69,11 +69,11 @@ void ufs_free_inode (struct inode * inode) ino = inode->i_ino; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return; } @@ -81,7 +81,7 @@ void ufs_free_inode (struct inode * inode) bit = ufs_inotocgoff (ino); ucpi = ufs_load_cylinder (sb, cg); if (!ucpi) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return; } ucg = ubh_get_ucg(UCPI_UBH(ucpi)); @@ -115,7 +115,7 @@ void ufs_free_inode (struct inode * inode) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); } @@ -193,7 +193,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) sbi = UFS_SB(sb); uspi = sbi->s_uspi; - lock_ufs(sb); + mutex_lock(&sbi->s_lock); /* * Try to place the inode in its parent directory @@ -331,21 +331,21 @@ cg_found: sync_dirty_buffer(bh); brelse(bh); } - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); clear_nlink(inode); unlock_new_inode(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); failed: - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); make_bad_inode(inode); iput (inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 99aaf5c9bf4d..a064cf44b143 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -41,9 +41,7 @@ #include "swab.h" #include "util.h" -static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); - -static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) +static int ufs_block_to_path(struct inode *inode, sector_t i_block, unsigned offsets[4]) { struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; int ptrs = uspi->s_apb; @@ -75,227 +73,232 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off return n; } +typedef struct { + void *p; + union { + __fs32 key32; + __fs64 key64; + }; + struct buffer_head *bh; +} Indirect; + +static inline int grow_chain32(struct ufs_inode_info *ufsi, + struct buffer_head *bh, __fs32 *v, + Indirect *from, Indirect *to) +{ + Indirect *p; + unsigned seq; + to->bh = bh; + do { + seq = read_seqbegin(&ufsi->meta_lock); + to->key32 = *(__fs32 *)(to->p = v); + for (p = from; p <= to && p->key32 == *(__fs32 *)p->p; p++) + ; + } while (read_seqretry(&ufsi->meta_lock, seq)); + return (p > to); +} + +static inline int grow_chain64(struct ufs_inode_info *ufsi, + struct buffer_head *bh, __fs64 *v, + Indirect *from, Indirect *to) +{ + Indirect *p; + unsigned seq; + to->bh = bh; + do { + seq = read_seqbegin(&ufsi->meta_lock); + to->key64 = *(__fs64 *)(to->p = v); + for (p = from; p <= to && p->key64 == *(__fs64 *)p->p; p++) + ; + } while (read_seqretry(&ufsi->meta_lock, seq)); + return (p > to); +} + /* * Returns the location of the fragment from * the beginning of the filesystem. */ -static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) +static u64 ufs_frag_map(struct inode *inode, unsigned offsets[4], int depth) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; int shift = uspi->s_apbshift-uspi->s_fpbshift; - sector_t offsets[4], *p; - int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); - u64 ret = 0L; - __fs32 block; - __fs64 u2_block = 0L; + Indirect chain[4], *q = chain; + unsigned *p; unsigned flags = UFS_SB(sb)->s_flags; - u64 temp = 0L; + u64 res = 0; - UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", uspi->s_fpbshift, uspi->s_apbmask, (unsigned long long)mask); if (depth == 0) - return 0; + goto no_block; +again: p = offsets; - if (needs_lock) - lock_ufs(sb); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; - block = ufsi->i_u1.i_data[*p++]; - if (!block) - goto out; + if (!grow_chain32(ufsi, NULL, &ufsi->i_u1.i_data[*p++], chain, q)) + goto changed; + if (!q->key32) + goto no_block; while (--depth) { + __fs32 *ptr; struct buffer_head *bh; - sector_t n = *p++; + unsigned n = *p++; - bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift)); + bh = sb_bread(sb, uspi->s_sbbase + + fs32_to_cpu(sb, q->key32) + (n>>shift)); if (!bh) - goto out; - block = ((__fs32 *) bh->b_data)[n & mask]; - brelse (bh); - if (!block) - goto out; - } - ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask)); - goto out; -ufs2: - u2_block = ufsi->i_u1.u2_i_data[*p++]; - if (!u2_block) - goto out; + goto no_block; + ptr = (__fs32 *)bh->b_data + (n & mask); + if (!grow_chain32(ufsi, bh, ptr, chain, ++q)) + goto changed; + if (!q->key32) + goto no_block; + } + res = fs32_to_cpu(sb, q->key32); + goto found; +ufs2: + if (!grow_chain64(ufsi, NULL, &ufsi->i_u1.u2_i_data[*p++], chain, q)) + goto changed; + if (!q->key64) + goto no_block; while (--depth) { + __fs64 *ptr; struct buffer_head *bh; - sector_t n = *p++; - + unsigned n = *p++; - temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block); - bh = sb_bread(sb, temp +(u64) (n>>shift)); + bh = sb_bread(sb, uspi->s_sbbase + + fs64_to_cpu(sb, q->key64) + (n>>shift)); if (!bh) - goto out; - u2_block = ((__fs64 *)bh->b_data)[n & mask]; - brelse(bh); - if (!u2_block) - goto out; + goto no_block; + ptr = (__fs64 *)bh->b_data + (n & mask); + if (!grow_chain64(ufsi, bh, ptr, chain, ++q)) + goto changed; + if (!q->key64) + goto no_block; } - temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block); - ret = temp + (u64) (frag & uspi->s_fpbmask); + res = fs64_to_cpu(sb, q->key64); +found: + res += uspi->s_sbbase; +no_block: + while (q > chain) { + brelse(q->bh); + q--; + } + return res; -out: - if (needs_lock) - unlock_ufs(sb); - return ret; +changed: + while (q > chain) { + brelse(q->bh); + q--; + } + goto again; +} + +/* + * Unpacking tails: we have a file with partial final block and + * we had been asked to extend it. If the fragment being written + * is within the same block, we need to extend the tail just to cover + * that fragment. Otherwise the tail is extended to full block. + * + * Note that we might need to create a _new_ tail, but that will + * be handled elsewhere; this is strictly for resizing old + * ones. + */ +static bool +ufs_extend_tail(struct inode *inode, u64 writes_to, + int *err, struct page *locked_page) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned lastfrag = ufsi->i_lastfrag; /* it's a short file, so unsigned is enough */ + unsigned block = ufs_fragstoblks(lastfrag); + unsigned new_size; + void *p; + u64 tmp; + + if (writes_to < (lastfrag | uspi->s_fpbmask)) + new_size = (writes_to & uspi->s_fpbmask) + 1; + else + new_size = uspi->s_fpb; + + p = ufs_get_direct_data_ptr(uspi, ufsi, block); + tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), + new_size, err, locked_page); + return tmp != 0; } /** * ufs_inode_getfrag() - allocate new fragment(s) * @inode: pointer to inode - * @fragment: number of `fragment' which hold pointer - * to new allocated fragment(s) + * @index: number of block pointer within the inode's array. * @new_fragment: number of new allocated fragment(s) - * @required: how many fragment(s) we require * @err: we set it if something wrong - * @phys: pointer to where we save physical number of new allocated fragments, - * NULL if we allocate not data(indirect blocks for example). * @new: we set it if we allocate new block * @locked_page: for ufs_new_fragments() */ -static struct buffer_head * -ufs_inode_getfrag(struct inode *inode, u64 fragment, - sector_t new_fragment, unsigned int required, int *err, - long *phys, int *new, struct page *locked_page) +static u64 +ufs_inode_getfrag(struct inode *inode, unsigned index, + sector_t new_fragment, int *err, + int *new, struct page *locked_page) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct buffer_head * result; - unsigned blockoff, lastblockoff; - u64 tmp, goal, lastfrag, block, lastblock; - void *p, *p2; - - UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, " - "metadata %d\n", inode->i_ino, (unsigned long long)fragment, - (unsigned long long)new_fragment, required, !phys); + u64 tmp, goal, lastfrag; + unsigned nfrags = uspi->s_fpb; + void *p; /* TODO : to be done for write support if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; */ - block = ufs_fragstoblks (fragment); - blockoff = ufs_fragnum (fragment); - p = ufs_get_direct_data_ptr(uspi, ufsi, block); - - goal = 0; - -repeat: + p = ufs_get_direct_data_ptr(uspi, ufsi, index); tmp = ufs_data_ptr_to_cpu(sb, p); + if (tmp) + goto out; lastfrag = ufsi->i_lastfrag; - if (tmp && fragment < lastfrag) { - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == ufs_data_ptr_to_cpu(sb, p)) { - UFSD("EXIT, result %llu\n", - (unsigned long long)tmp + blockoff); - return result; - } - brelse (result); - goto repeat; - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - return NULL; - } - } - lastblock = ufs_fragstoblks (lastfrag); - lastblockoff = ufs_fragnum (lastfrag); - /* - * We will extend file into new block beyond last allocated block - */ - if (lastblock < block) { - /* - * We must reallocate last allocated block - */ - if (lastblockoff) { - p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock); - tmp = ufs_new_fragments(inode, p2, lastfrag, - ufs_data_ptr_to_cpu(sb, p2), - uspi->s_fpb - lastblockoff, - err, locked_page); - if (!tmp) { - if (lastfrag != ufsi->i_lastfrag) - goto repeat; - else - return NULL; - } - lastfrag = ufsi->i_lastfrag; - - } - tmp = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, - lastblock)); - if (tmp) - goal = tmp + uspi->s_fpb; - tmp = ufs_new_fragments (inode, p, fragment - blockoff, - goal, required + blockoff, - err, - phys != NULL ? locked_page : NULL); - } else if (lastblock == block) { - /* - * We will extend last allocated block - */ - tmp = ufs_new_fragments(inode, p, fragment - - (blockoff - lastblockoff), - ufs_data_ptr_to_cpu(sb, p), - required + (blockoff - lastblockoff), - err, phys != NULL ? locked_page : NULL); - } else /* (lastblock > block) */ { - /* - * We will allocate new block before last allocated block - */ - if (block) { - tmp = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); - if (tmp) - goal = tmp + uspi->s_fpb; - } - tmp = ufs_new_fragments(inode, p, fragment - blockoff, - goal, uspi->s_fpb, err, - phys != NULL ? locked_page : NULL); + /* will that be a new tail? */ + if (new_fragment < UFS_NDIR_FRAGMENT && new_fragment >= lastfrag) + nfrags = (new_fragment & uspi->s_fpbmask) + 1; + + goal = 0; + if (index) { + goal = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, index - 1)); + if (goal) + goal += uspi->s_fpb; } + tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), + goal, uspi->s_fpb, err, locked_page); + if (!tmp) { - if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) || - (blockoff && lastfrag != ufsi->i_lastfrag)) - goto repeat; *err = -ENOSPC; - return NULL; + return 0; } - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - result = NULL; - *err = 0; + if (new) *new = 1; - } - inode->i_ctime = CURRENT_TIME_SEC; if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); - UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff); - return result; +out: + return tmp + uspi->s_sbbase; /* This part : To be implemented .... Required only for writing, not required for READ-ONLY. @@ -316,95 +319,70 @@ repeat2: /** * ufs_inode_getblock() - allocate new block * @inode: pointer to inode - * @bh: pointer to block which hold "pointer" to new allocated block - * @fragment: number of `fragment' which hold pointer - * to new allocated block + * @ind_block: block number of the indirect block + * @index: number of pointer within the indirect block * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) * @err: see ufs_inode_getfrag() - * @phys: see ufs_inode_getfrag() * @new: see ufs_inode_getfrag() * @locked_page: see ufs_inode_getfrag() */ -static struct buffer_head * -ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, - u64 fragment, sector_t new_fragment, int *err, - long *phys, int *new, struct page *locked_page) +static u64 +ufs_inode_getblock(struct inode *inode, u64 ind_block, + unsigned index, sector_t new_fragment, int *err, + int *new, struct page *locked_page) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct buffer_head * result; - unsigned blockoff; - u64 tmp, goal, block; + int shift = uspi->s_apbshift - uspi->s_fpbshift; + u64 tmp = 0, goal; + struct buffer_head *bh; void *p; - block = ufs_fragstoblks (fragment); - blockoff = ufs_fragnum (fragment); - - UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", - inode->i_ino, (unsigned long long)fragment, - (unsigned long long)new_fragment, !phys); + if (!ind_block) + return 0; - result = NULL; - if (!bh) - goto out; - if (!buffer_uptodate(bh)) { - ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); - if (!buffer_uptodate(bh)) - goto out; + bh = sb_bread(sb, ind_block + (index >> shift)); + if (unlikely(!bh)) { + *err = -EIO; + return 0; } + + index &= uspi->s_apbmask >> uspi->s_fpbshift; if (uspi->fs_magic == UFS2_MAGIC) - p = (__fs64 *)bh->b_data + block; + p = (__fs64 *)bh->b_data + index; else - p = (__fs32 *)bh->b_data + block; -repeat: + p = (__fs32 *)bh->b_data + index; + tmp = ufs_data_ptr_to_cpu(sb, p); - if (tmp) { - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == ufs_data_ptr_to_cpu(sb, p)) - goto out; - brelse (result); - goto repeat; - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - goto out; - } - } + if (tmp) + goto out; - if (block && (uspi->fs_magic == UFS2_MAGIC ? - (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : - (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1])))) + if (index && (uspi->fs_magic == UFS2_MAGIC ? + (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[index-1])) : + (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[index-1])))) goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err, locked_page); - if (!tmp) { - if (ufs_data_ptr_to_cpu(sb, p)) - goto repeat; + if (!tmp) goto out; - } - - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; + if (new) *new = 1; - } mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - UFSD("result %llu\n", (unsigned long long)tmp + blockoff); out: brelse (bh); UFSD("EXIT\n"); - return result; + if (tmp) + tmp += uspi->s_sbbase; + return tmp; } /** @@ -412,103 +390,64 @@ out: * readpage, writepage and so on */ -int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { - struct super_block * sb = inode->i_sb; - struct ufs_sb_info * sbi = UFS_SB(sb); - struct ufs_sb_private_info * uspi = sbi->s_uspi; - struct buffer_head * bh; - int ret, err, new; - unsigned long ptr,phys; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int err = 0, new = 0; + unsigned offsets[4]; + int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets); u64 phys64 = 0; - bool needs_lock = (sbi->mutex_owner != current); - + unsigned frag = fragment & uspi->s_fpbmask; + if (!create) { - phys64 = ufs_frag_map(inode, fragment, needs_lock); - UFSD("phys64 = %llu\n", (unsigned long long)phys64); - if (phys64) - map_bh(bh_result, sb, phys64); - return 0; + phys64 = ufs_frag_map(inode, offsets, depth); + goto out; } /* This code entered only while writing ....? */ - err = -EIO; - new = 0; - ret = 0; - bh = NULL; - - if (needs_lock) - lock_ufs(sb); + mutex_lock(&UFS_I(inode)->truncate_mutex); UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); - if (fragment > - ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) - << uspi->s_fpbshift)) - goto abort_too_big; - - err = 0; - ptr = fragment; - - /* - * ok, these macros clean the logic up a bit and make - * it much more readable: - */ -#define GET_INODE_DATABLOCK(x) \ - ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\ - bh_result->b_page) -#define GET_INODE_PTR(x) \ - ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\ - bh_result->b_page) -#define GET_INDIRECT_DATABLOCK(x) \ - ufs_inode_getblock(inode, bh, x, fragment, \ - &err, &phys, &new, bh_result->b_page) -#define GET_INDIRECT_PTR(x) \ - ufs_inode_getblock(inode, bh, x, fragment, \ - &err, NULL, NULL, NULL) - - if (ptr < UFS_NDIR_FRAGMENT) { - bh = GET_INODE_DATABLOCK(ptr); + if (unlikely(!depth)) { + ufs_warning(sb, "ufs_get_block", "block > big"); + err = -EIO; goto out; } - ptr -= UFS_NDIR_FRAGMENT; - if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) { - bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift)); - goto get_indirect; - } - ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); - if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) { - bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift)); - goto get_double; - } - ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); - bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift)); - bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask); -get_double: - bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask); -get_indirect: - bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask); - -#undef GET_INODE_DATABLOCK -#undef GET_INODE_PTR -#undef GET_INDIRECT_DATABLOCK -#undef GET_INDIRECT_PTR -out: - if (err) - goto abort; - if (new) - set_buffer_new(bh_result); - map_bh(bh_result, sb, phys); -abort: - if (needs_lock) - unlock_ufs(sb); + if (UFS_I(inode)->i_lastfrag < UFS_NDIR_FRAGMENT) { + unsigned lastfrag = UFS_I(inode)->i_lastfrag; + unsigned tailfrags = lastfrag & uspi->s_fpbmask; + if (tailfrags && fragment >= lastfrag) { + if (!ufs_extend_tail(inode, fragment, + &err, bh_result->b_page)) + goto out; + } + } + if (depth == 1) { + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, &new, bh_result->b_page); + } else { + int i; + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, NULL, NULL); + for (i = 1; i < depth - 1; i++) + phys64 = ufs_inode_getblock(inode, phys64, offsets[i], + fragment, &err, NULL, NULL); + phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], + fragment, &err, &new, bh_result->b_page); + } +out: + if (phys64) { + phys64 += frag; + map_bh(bh_result, sb, phys64); + if (new) + set_buffer_new(bh_result); + } + mutex_unlock(&UFS_I(inode)->truncate_mutex); return err; - -abort_too_big: - ufs_warning(sb, "ufs_get_block", "block > big"); - goto abort; } static int ufs_writepage(struct page *page, struct writeback_control *wbc) @@ -526,12 +465,16 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ufs_getfrag_block); } +static void ufs_truncate_blocks(struct inode *); + static void ufs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; - if (to > inode->i_size) + if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); + ufs_truncate_blocks(inode); + } } static int ufs_write_begin(struct file *file, struct address_space *mapping, @@ -548,6 +491,18 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int ufs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ufs_write_failed(mapping, pos + len); + return ret; +} + static sector_t ufs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ufs_getfrag_block); @@ -557,7 +512,7 @@ const struct address_space_operations ufs_aops = { .readpage = ufs_readpage, .writepage = ufs_writepage, .write_begin = ufs_write_begin, - .write_end = generic_write_end, + .write_end = ufs_write_end, .bmap = ufs_bmap }; @@ -599,7 +554,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; } - + /* * Linux now has 32-bit uid and gid, so we can support EFT. */ @@ -619,7 +574,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); - + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr, sizeof(ufs_inode->ui_u2.ui_addr)); @@ -753,7 +708,7 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode)); ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode)); - + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); ufs_inode->ui_atime.tv_usec = 0; @@ -855,23 +810,19 @@ static int ufs_update_inode(struct inode * inode, int do_sync) ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); } - + mark_buffer_dirty(bh); if (do_sync) sync_dirty_buffer(bh); brelse (bh); - + UFSD("EXIT\n"); return 0; } int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - lock_ufs(inode->i_sb); - ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); - unlock_ufs(inode->i_sb); - return ret; + return ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } int ufs_sync_inode (struct inode *inode) @@ -888,16 +839,9 @@ void ufs_evict_inode(struct inode * inode) truncate_inode_pages_final(&inode->i_data); if (want_delete) { - loff_t old_i_size; - /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ - lock_ufs(inode->i_sb); - mark_inode_dirty(inode); - ufs_update_inode(inode, IS_SYNC(inode)); - old_i_size = inode->i_size; inode->i_size = 0; - if (inode->i_blocks && ufs_truncate(inode, old_i_size)) - ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); - unlock_ufs(inode->i_sb); + if (inode->i_blocks) + ufs_truncate_blocks(inode); } invalidate_inode_buffers(inode); @@ -906,3 +850,378 @@ void ufs_evict_inode(struct inode * inode) if (want_delete) ufs_free_inode(inode); } + +struct to_free { + struct inode *inode; + u64 to; + unsigned count; +}; + +static inline void free_data(struct to_free *ctx, u64 from, unsigned count) +{ + if (ctx->count && ctx->to != from) { + ufs_free_blocks(ctx->inode, ctx->to - ctx->count, ctx->count); + ctx->count = 0; + } + ctx->count += count; + ctx->to = from + count; +} + +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) + +static void ufs_trunc_direct(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + void *p; + u64 frag1, frag2, frag3, frag4, block1, block2; + struct to_free ctx = {.inode = inode}; + unsigned i, tmp; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag1 = DIRECT_FRAGMENT; + frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); + frag3 = frag4 & ~uspi->s_fpbmask; + block1 = block2 = 0; + if (frag2 > frag3) { + frag2 = frag4; + frag3 = frag4 = 0; + } else if (frag2 < frag3) { + block1 = ufs_fragstoblks (frag2); + block2 = ufs_fragstoblks (frag3); + } + + UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," + " frag3 %llu, frag4 %llu\n", inode->i_ino, + (unsigned long long)frag1, (unsigned long long)frag2, + (unsigned long long)block1, (unsigned long long)block2, + (unsigned long long)frag3, (unsigned long long)frag4); + + if (frag1 >= frag2) + goto next1; + + /* + * Free first free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag2 -= frag1; + frag1 = ufs_fragnum (frag1); + + ufs_free_fragments(inode, tmp + frag1, frag2); + +next1: + /* + * Free whole blocks + */ + for (i = block1 ; i < block2; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + free_data(&ctx, tmp, uspi->s_fpb); + } + + free_data(&ctx, 0, 0); + + if (frag3 >= frag4) + goto next3; + + /* + * Free last free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic(sb, "ufs_truncate_direct", "internal error"); + frag4 = ufs_fragnum (frag4); + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + ufs_free_fragments (inode, tmp, frag4); + next3: + + UFSD("EXIT: ino %lu\n", inode->i_ino); +} + +static void free_full_branch(struct inode *inode, u64 ind_block, int depth) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_buffer_head *ubh = ubh_bread(sb, ind_block, uspi->s_bsize); + unsigned i; + + if (!ubh) + return; + + if (--depth) { + for (i = 0; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) + free_full_branch(inode, block, depth); + } + } else { + struct to_free ctx = {.inode = inode}; + + for (i = 0; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) + free_data(&ctx, block, uspi->s_fpb); + } + free_data(&ctx, 0, 0); + } + + ubh_bforget(ubh); + ufs_free_blocks(inode, ind_block, uspi->s_fpb); +} + +static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buffer_head *ubh, int depth) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i; + + if (--depth) { + for (i = from; i < uspi->s_apb ; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_full_branch(inode, block, depth); + } + } + } else { + struct to_free ctx = {.inode = inode}; + + for (i = from; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_data(&ctx, block, uspi->s_fpb); + } + } + free_data(&ctx, 0, 0); + } + if (IS_SYNC(inode) && ubh_buffer_dirty(ubh)) + ubh_sync_block(ubh); + ubh_brelse(ubh); +} + +static int ufs_alloc_lastblock(struct inode *inode, loff_t size) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i, end; + sector_t lastfrag; + struct page *lastpage; + struct buffer_head *bh; + u64 phys64; + + lastfrag = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + + if (!lastfrag) + goto out; + + lastfrag--; + + lastpage = ufs_get_locked_page(mapping, lastfrag >> + (PAGE_CACHE_SHIFT - inode->i_blkbits)); + if (IS_ERR(lastpage)) { + err = -EIO; + goto out; + } + + end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); + bh = page_buffers(lastpage); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; + + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); + } + + if (lastfrag >= UFS_IND_FRAGMENT) { + end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; + phys64 = bh->b_blocknr + 1; + for (i = 0; i < end; ++i) { + bh = sb_getblk(sb, i + phys64); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +out_unlock: + ufs_put_locked_page(lastpage); +out: + return err; +} + +static void __ufs_truncate_blocks(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned offsets[4]; + int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); + int depth2; + unsigned i; + struct ufs_buffer_head *ubh[3]; + void *p; + u64 block; + + if (!depth) + return; + + /* find the last non-zero in offsets[] */ + for (depth2 = depth - 1; depth2; depth2--) + if (offsets[depth2]) + break; + + mutex_lock(&ufsi->truncate_mutex); + if (depth == 1) { + ufs_trunc_direct(inode); + offsets[0] = UFS_IND_BLOCK; + } else { + /* get the blocks that should be partially emptied */ + p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]); + for (i = 0; i < depth2; i++) { + offsets[i]++; /* next branch is fully freed */ + block = ufs_data_ptr_to_cpu(sb, p); + if (!block) + break; + ubh[i] = ubh_bread(sb, block, uspi->s_bsize); + if (!ubh[i]) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + break; + } + p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]); + } + while (i--) + free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1); + } + for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + free_full_branch(inode, block, i - UFS_IND_BLOCK + 1); + } + } + ufsi->i_lastfrag = DIRECT_FRAGMENT; + mark_inode_dirty(inode); + mutex_unlock(&ufsi->truncate_mutex); +} + +static int ufs_truncate(struct inode *inode, loff_t size) +{ + int err = 0; + + UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + inode->i_ino, (unsigned long long)size, + (unsigned long long)i_size_read(inode)); + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + err = ufs_alloc_lastblock(inode, size); + + if (err) + goto out; + + block_truncate_page(inode->i_mapping, size, ufs_getfrag_block); + + truncate_setsize(inode, size); + + __ufs_truncate_blocks(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + UFSD("EXIT: err %d\n", err); + return err; +} + +void ufs_truncate_blocks(struct inode *inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ufs_truncate_blocks(inode); +} + +int ufs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + error = ufs_truncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations ufs_file_inode_operations = { + .setattr = ufs_setattr, +}; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index f773deb1d2e3..47966554317c 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -56,11 +56,9 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi if (dentry->d_name.len > UFS_MAXNAMLEN) return ERR_PTR(-ENAMETOOLONG); - lock_ufs(dir->i_sb); ino = ufs_inode_by_name(dir, &dentry->d_name); if (ino) inode = ufs_iget(dir->i_sb, ino); - unlock_ufs(dir->i_sb); return d_splice_alias(inode, dentry); } @@ -76,24 +74,16 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; - int err; - - UFSD("BEGIN\n"); inode = ufs_new_inode(dir, mode); - err = PTR_ERR(inode); + if (IS_ERR(inode)) + return PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ufs_file_inode_operations; - inode->i_fop = &ufs_file_operations; - inode->i_mapping->a_ops = &ufs_aops; - mark_inode_dirty(inode); - lock_ufs(dir->i_sb); - err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); - } - UFSD("END: err=%d\n", err); - return err; + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + mark_inode_dirty(inode); + return ufs_add_nondir(dentry, inode); } static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) @@ -110,9 +100,7 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev init_special_inode(inode, mode, rdev); ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); mark_inode_dirty(inode); - lock_ufs(dir->i_sb); err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); } return err; } @@ -121,19 +109,18 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; - int err = -ENAMETOOLONG; + int err; unsigned l = strlen(symname)+1; struct inode * inode; if (l > sb->s_blocksize) - goto out_notlocked; + return -ENAMETOOLONG; inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_notlocked; + return err; - lock_ufs(dir->i_sb); if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ inode->i_op = &ufs_symlink_inode_operations; @@ -150,17 +137,13 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, } mark_inode_dirty(inode); - err = ufs_add_nondir(dentry, inode); -out: - unlock_ufs(dir->i_sb); -out_notlocked: - return err; + return ufs_add_nondir(dentry, inode); out_fail: inode_dec_link_count(inode); unlock_new_inode(inode); iput(inode); - goto out; + return err; } static int ufs_link (struct dentry * old_dentry, struct inode * dir, @@ -169,14 +152,16 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int error; - lock_ufs(dir->i_sb); - inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); ihold(inode); - error = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); + error = ufs_add_link(dentry, inode); + if (error) { + inode_dec_link_count(inode); + iput(inode); + } else + d_instantiate(dentry, inode); return error; } @@ -185,9 +170,12 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; + inode_inc_link_count(dir); + inode = ufs_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); if (IS_ERR(inode)) - return PTR_ERR(inode); + goto out_dir; inode->i_op = &ufs_dir_inode_operations; inode->i_fop = &ufs_dir_operations; @@ -195,9 +183,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) inode_inc_link_count(inode); - lock_ufs(dir->i_sb); - inode_inc_link_count(dir); - err = ufs_make_empty(inode, dir); if (err) goto out_fail; @@ -205,20 +190,19 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) err = ufs_add_link(dentry, inode); if (err) goto out_fail; - unlock_ufs(dir->i_sb); + unlock_new_inode(inode); d_instantiate(dentry, inode); -out: - return err; + return 0; out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); unlock_new_inode(inode); iput (inode); +out_dir: inode_dec_link_count(dir); - unlock_ufs(dir->i_sb); - goto out; + return err; } static int ufs_unlink(struct inode *dir, struct dentry *dentry) @@ -248,7 +232,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); int err= -ENOTEMPTY; - lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); if (!err) { @@ -257,7 +240,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) inode_dec_link_count(dir); } } - unlock_ufs(dir->i_sb); return err; } @@ -295,7 +277,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_page, old_inode); + ufs_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) drop_nlink(new_inode); @@ -318,7 +300,12 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, mark_inode_dirty(old_inode); if (dir_de) { - ufs_set_link(old_inode, dir_de, dir_page, new_dir); + if (old_dir != new_dir) + ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } inode_dec_link_count(old_dir); } return 0; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 098508a93c7b..f6390eec02ca 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -94,22 +94,6 @@ #include "swab.h" #include "util.h" -void lock_ufs(struct super_block *sb) -{ - struct ufs_sb_info *sbi = UFS_SB(sb); - - mutex_lock(&sbi->mutex); - sbi->mutex_owner = current; -} - -void unlock_ufs(struct super_block *sb) -{ - struct ufs_sb_info *sbi = UFS_SB(sb); - - sbi->mutex_owner = NULL; - mutex_unlock(&sbi->mutex); -} - static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -694,7 +678,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) struct ufs_super_block_third * usb3; unsigned flags; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -712,7 +696,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) ufs_put_cstotal(sb); UFSD("EXIT\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -756,7 +740,6 @@ static void ufs_put_super(struct super_block *sb) ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); - mutex_destroy(&sbi->mutex); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); @@ -799,7 +782,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); - mutex_init(&sbi->mutex); + mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* @@ -1254,7 +1237,6 @@ magic_found: return 0; failed: - mutex_destroy(&sbi->mutex); if (ubh) ubh_brelse_uspi (uspi); kfree (uspi); @@ -1277,7 +1259,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned flags; sync_filesystem(sb); - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1291,20 +1273,20 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { pr_err("ufstype can't be changed during remount\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1327,7 +1309,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) */ #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1336,19 +1318,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return -EPERM; } sb->s_flags &= ~MS_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1380,8 +1362,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - lock_ufs(sb); - + mutex_lock(&UFS_SB(sb)->s_lock); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { @@ -1402,7 +1383,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1418,6 +1399,8 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; + seqlock_init(&ei->meta_lock); + mutex_init(&ei->truncate_mutex); return &ei->vfs_inode; } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c deleted file mode 100644 index 21154704c168..000000000000 --- a/fs/ufs/truncate.c +++ /dev/null @@ -1,523 +0,0 @@ -/* - * linux/fs/ufs/truncate.c - * - * Copyright (C) 1998 - * Daniel Pirkl <daniel.pirkl@email.cz> - * Charles University, Faculty of Mathematics and Physics - * - * from - * - * linux/fs/ext2/truncate.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/truncate.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -/* - * Real random numbers for secure rm added 94/02/18 - * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr> - */ - -/* - * Adoptation to use page cache and UFS2 write support by - * Evgeniy Dushistov <dushistov@mail.ru>, 2006-2007 - */ - -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/fcntl.h> -#include <linux/time.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include <linux/sched.h> - -#include "ufs_fs.h" -#include "ufs.h" -#include "swab.h" -#include "util.h" - -/* - * Secure deletion currently doesn't work. It interacts very badly - * with buffers shared with memory mappings, and for that reason - * can't be done in the truncate() routines. It should instead be - * done separately in "release()" before calling the truncate routines - * that will release the actual file blocks. - * - * Linus - */ - -#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) -#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) - - -static int ufs_trunc_direct(struct inode *inode) -{ - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - void *p; - u64 frag1, frag2, frag3, frag4, block1, block2; - unsigned frag_to_free, free_count; - unsigned i, tmp; - int retry; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; - retry = 0; - - frag1 = DIRECT_FRAGMENT; - frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); - frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); - frag3 = frag4 & ~uspi->s_fpbmask; - block1 = block2 = 0; - if (frag2 > frag3) { - frag2 = frag4; - frag3 = frag4 = 0; - } else if (frag2 < frag3) { - block1 = ufs_fragstoblks (frag2); - block2 = ufs_fragstoblks (frag3); - } - - UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," - " frag3 %llu, frag4 %llu\n", inode->i_ino, - (unsigned long long)frag1, (unsigned long long)frag2, - (unsigned long long)block1, (unsigned long long)block2, - (unsigned long long)frag3, (unsigned long long)frag4); - - if (frag1 >= frag2) - goto next1; - - /* - * Free first free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic (sb, "ufs_trunc_direct", "internal error"); - frag2 -= frag1; - frag1 = ufs_fragnum (frag1); - - ufs_free_fragments(inode, tmp + frag1, frag2); - mark_inode_dirty(inode); - frag_to_free = tmp + frag1; - -next1: - /* - * Free whole blocks - */ - for (i = block1 ; i < block2; i++) { - p = ufs_get_direct_data_ptr(uspi, ufsi, i); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - continue; - ufs_data_ptr_clear(uspi, p); - - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - mark_inode_dirty(inode); - } - - if (free_count > 0) - ufs_free_blocks (inode, frag_to_free, free_count); - - if (frag3 >= frag4) - goto next3; - - /* - * Free last free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic(sb, "ufs_truncate_direct", "internal error"); - frag4 = ufs_fragnum (frag4); - ufs_data_ptr_clear(uspi, p); - - ufs_free_fragments (inode, tmp, frag4); - mark_inode_dirty(inode); - next3: - - UFSD("EXIT: ino %lu\n", inode->i_ino); - return retry; -} - - -static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) -{ - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_buffer_head * ind_ubh; - void *ind; - u64 tmp, indirect_block, i, frag_to_free; - unsigned free_count; - int retry; - - UFSD("ENTER: ino %lu, offset %llu, p: %p\n", - inode->i_ino, (unsigned long long)offset, p); - - BUG_ON(!p); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; - retry = 0; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return 0; - ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (ind_ubh); - return 1; - } - if (!ind_ubh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; - for (i = indirect_block; i < uspi->s_apb; i++) { - ind = ubh_get_data_ptr(uspi, ind_ubh, i); - tmp = ufs_data_ptr_to_cpu(sb, ind); - if (!tmp) - continue; - - ufs_data_ptr_clear(uspi, ind); - ubh_mark_buffer_dirty(ind_ubh); - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - - mark_inode_dirty(inode); - } - - if (free_count > 0) { - ufs_free_blocks (inode, frag_to_free, free_count); - } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, ind_ubh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks (inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(ind_ubh); - ind_ubh = NULL; - } - if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) - ubh_sync_block(ind_ubh); - ubh_brelse (ind_ubh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - - return retry; -} - -static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) -{ - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_buffer_head *dind_bh; - u64 i, tmp, dindirect_block; - void *dind; - int retry = 0; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - dindirect_block = (DIRECT_BLOCK > offset) - ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; - retry = 0; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return 0; - dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (dind_bh); - return 1; - } - if (!dind_bh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - for (i = dindirect_block ; i < uspi->s_apb ; i++) { - dind = ubh_get_data_ptr(uspi, dind_bh, i); - tmp = ufs_data_ptr_to_cpu(sb, dind); - if (!tmp) - continue; - retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); - ubh_mark_buffer_dirty(dind_bh); - } - - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, dind_bh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(dind_bh); - dind_bh = NULL; - } - if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) - ubh_sync_block(dind_bh); - ubh_brelse (dind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - - return retry; -} - -static int ufs_trunc_tindirect(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); - struct ufs_buffer_head * tind_bh; - u64 tindirect_block, tmp, i; - void *tind, *p; - int retry; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - retry = 0; - - tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) - ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; - - p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); - if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) - return 0; - tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (tind_bh); - return 1; - } - if (!tind_bh) { - ufs_data_ptr_clear(uspi, p); - return 0; - } - - for (i = tindirect_block ; i < uspi->s_apb ; i++) { - tind = ubh_get_data_ptr(uspi, tind_bh, i); - retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + - uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); - ubh_mark_buffer_dirty(tind_bh); - } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, tind_bh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - ufs_data_ptr_clear(uspi, p); - - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ubh_bforget(tind_bh); - tind_bh = NULL; - } - if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) - ubh_sync_block(tind_bh); - ubh_brelse (tind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); - return retry; -} - -static int ufs_alloc_lastblock(struct inode *inode) -{ - int err = 0; - struct super_block *sb = inode->i_sb; - struct address_space *mapping = inode->i_mapping; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - unsigned i, end; - sector_t lastfrag; - struct page *lastpage; - struct buffer_head *bh; - u64 phys64; - - lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; - - if (!lastfrag) - goto out; - - lastfrag--; - - lastpage = ufs_get_locked_page(mapping, lastfrag >> - (PAGE_CACHE_SHIFT - inode->i_blkbits)); - if (IS_ERR(lastpage)) { - err = -EIO; - goto out; - } - - end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); - bh = page_buffers(lastpage); - for (i = 0; i < end; ++i) - bh = bh->b_this_page; - - - err = ufs_getfrag_block(inode, lastfrag, bh, 1); - - if (unlikely(err)) - goto out_unlock; - - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - /* - * we do not zeroize fragment, because of - * if it maped to hole, it already contains zeroes - */ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - set_page_dirty(lastpage); - } - - if (lastfrag >= UFS_IND_FRAGMENT) { - end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; - phys64 = bh->b_blocknr + 1; - for (i = 0; i < end; ++i) { - bh = sb_getblk(sb, i + phys64); - lock_buffer(bh); - memset(bh->b_data, 0, sb->s_blocksize); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - unlock_buffer(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - } -out_unlock: - ufs_put_locked_page(lastpage); -out: - return err; -} - -int ufs_truncate(struct inode *inode, loff_t old_i_size) -{ - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - int retry, err = 0; - - UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", - inode->i_ino, (unsigned long long)i_size_read(inode), - (unsigned long long)old_i_size); - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return -EINVAL; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return -EPERM; - - err = ufs_alloc_lastblock(inode); - - if (err) { - i_size_write(inode, old_i_size); - goto out; - } - - block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); - - while (1) { - retry = ufs_trunc_direct(inode); - retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_IND_BLOCK)); - retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_DIND_BLOCK)); - retry |= ufs_trunc_tindirect (inode); - if (!retry) - break; - if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) - ufs_sync_inode (inode); - yield(); - } - - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - ufsi->i_lastfrag = DIRECT_FRAGMENT; - mark_inode_dirty(inode); -out: - UFSD("EXIT: err %d\n", err); - return err; -} - -int ufs_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - unsigned int ia_valid = attr->ia_valid; - int error; - - error = inode_change_ok(inode, attr); - if (error) - return error; - - if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { - loff_t old_i_size = inode->i_size; - - /* XXX(truncate): truncate_setsize should be called last */ - truncate_setsize(inode, attr->ia_size); - - lock_ufs(inode->i_sb); - error = ufs_truncate(inode, old_i_size); - unlock_ufs(inode->i_sb); - if (error) - return error; - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations ufs_file_inode_operations = { - .setattr = ufs_setattr, -}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 2a07396d5f9e..7da4aca868c0 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,12 +24,11 @@ struct ufs_sb_info { unsigned s_cgno[UFS_MAX_GROUP_LOADED]; unsigned short s_cg_loaded; unsigned s_mount_opt; - struct mutex mutex; - struct task_struct *mutex_owner; struct super_block *sb; int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ + struct mutex s_lock; }; struct ufs_inode_info { @@ -45,6 +44,8 @@ struct ufs_inode_info { __u32 i_oeftflag; __u16 i_osync; __u64 i_lastfrag; + seqlock_t meta_lock; + struct mutex truncate_mutex; __u32 i_dir_start_lookup; struct inode vfs_inode; }; @@ -105,7 +106,7 @@ extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page extern int ufs_empty_dir (struct inode *); extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode); + struct page *page, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; @@ -121,7 +122,7 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); +extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); /* namei.c */ extern const struct file_operations ufs_dir_operations; @@ -139,10 +140,6 @@ void ufs_mark_sb_dirty(struct super_block *sb); extern const struct inode_operations ufs_fast_symlink_inode_operations; extern const struct inode_operations ufs_symlink_inode_operations; -/* truncate.c */ -extern int ufs_truncate (struct inode *, loff_t); -extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); - static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) { return sb->s_fs_info; @@ -169,7 +166,4 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b) return do_div(b, uspi->s_fpg); } -extern void lock_ufs(struct super_block *sb); -extern void unlock_ufs(struct super_block *sb); - #endif /* _UFS_UFS_H */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c new file mode 100644 index 000000000000..634e676072cb --- /dev/null +++ b/fs/userfaultfd.c @@ -0,0 +1,1330 @@ +/* + * fs/userfaultfd.c + * + * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> + * Copyright (C) 2008-2009 Red Hat, Inc. + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Some part derived from fs/eventfd.c (anon inode setup) and + * mm/ksm.c (mm hashing). + */ + +#include <linux/hashtable.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/file.h> +#include <linux/bug.h> +#include <linux/anon_inodes.h> +#include <linux/syscalls.h> +#include <linux/userfaultfd_k.h> +#include <linux/mempolicy.h> +#include <linux/ioctl.h> +#include <linux/security.h> + +static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; + +enum userfaultfd_state { + UFFD_STATE_WAIT_API, + UFFD_STATE_RUNNING, +}; + +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + */ +struct userfaultfd_ctx { + /* waitqueue head for the pending (i.e. not read) userfaults */ + wait_queue_head_t fault_pending_wqh; + /* waitqueue head for the userfaults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + struct seqcount refile_seq; + /* pseudo fd refcounting */ + atomic_t refcount; + /* userfaultfd syscall flags */ + unsigned int flags; + /* state machine */ + enum userfaultfd_state state; + /* released */ + bool released; + /* mm with one ore more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + +struct userfaultfd_wait_queue { + struct uffd_msg msg; + wait_queue_t wq; + struct userfaultfd_ctx *ctx; +}; + +struct userfaultfd_wake_range { + unsigned long start; + unsigned long len; +}; + +static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, + int wake_flags, void *key) +{ + struct userfaultfd_wake_range *range = key; + int ret; + struct userfaultfd_wait_queue *uwq; + unsigned long start, len; + + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + ret = 0; + /* len == 0 means wake all */ + start = range->start; + len = range->len; + if (len && (start > uwq->msg.arg.pagefault.address || + start + len <= uwq->msg.arg.pagefault.address)) + goto out; + ret = wake_up_state(wq->private, mode); + if (ret) + /* + * Wake only once, autoremove behavior. + * + * After the effect of list_del_init is visible to the + * other CPUs, the waitqueue may disappear from under + * us, see the !list_empty_careful() in + * handle_userfault(). try_to_wake_up() has an + * implicit smp_mb__before_spinlock, and the + * wq->private is read before calling the extern + * function "wake_up_state" (which in turns calls + * try_to_wake_up). While the spin_lock;spin_unlock; + * wouldn't be enough, the smp_mb__before_spinlock is + * enough to avoid an explicit smp_mb() here. + */ + list_del_init(&wq->task_list); +out: + return ret; +} + +/** + * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to the userfaultfd context. + * + * Returns: In case of success, returns not zero. + */ +static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) +{ + if (!atomic_inc_not_zero(&ctx->refcount)) + BUG(); +} + +/** + * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to userfaultfd context. + * + * The userfaultfd context reference must have been previously acquired either + * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). + */ +static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); + mmput(ctx->mm); + kmem_cache_free(userfaultfd_ctx_cachep, ctx); + } +} + +static inline void msg_init(struct uffd_msg *msg) +{ + BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); + /* + * Must use memset to zero out the paddings or kernel data is + * leaked to userland. + */ + memset(msg, 0, sizeof(struct uffd_msg)); +} + +static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned int flags, + unsigned long reason) +{ + struct uffd_msg msg; + msg_init(&msg); + msg.event = UFFD_EVENT_PAGEFAULT; + msg.arg.pagefault.address = address; + if (flags & FAULT_FLAG_WRITE) + /* + * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE + * was not set in a UFFD_EVENT_PAGEFAULT, it means it + * was a read fault, otherwise if set it means it's + * a write fault. + */ + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; + if (reason & VM_UFFD_WP) + /* + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was + * not set in a UFFD_EVENT_PAGEFAULT, it means it was + * a missing fault, otherwise if set it means it's a + * write protect fault. + */ + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + return msg; +} + +/* + * Verify the pagetables are still not ok after having reigstered into + * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any + * userfault that has already been resolved, if userfaultfd_read and + * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different + * threads. + */ +static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + bool ret = true; + + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + pmd = pmd_offset(pud, address); + /* + * READ_ONCE must function as a barrier with narrower scope + * and it must be equivalent to: + * _pmd = *pmd; barrier(); + * + * This is to deal with the instability (as in + * pmd_trans_unstable) of the pmd. + */ + _pmd = READ_ONCE(*pmd); + if (!pmd_present(_pmd)) + goto out; + + ret = false; + if (pmd_trans_huge(_pmd)) + goto out; + + /* + * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it + * and use the standard pte_offset_map() instead of parsing _pmd. + */ + pte = pte_offset_map(pmd, address); + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. + */ + if (pte_none(*pte)) + ret = true; + pte_unmap(pte); + +out: + return ret; +} + +/* + * The locking rules involved in returning VM_FAULT_RETRY depending on + * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and + * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" + * recommendation in __lock_page_or_retry is not an understatement. + * + * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released + * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is + * not set. + * + * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not + * set, VM_FAULT_RETRY can still be returned if and only if there are + * fatal_signal_pending()s, and the mmap_sem must be released before + * returning it. + */ +int handle_userfault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, unsigned long reason) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue uwq; + int ret; + bool must_wait, return_to_userland; + + BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + ret = VM_FAULT_SIGBUS; + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx) + goto out; + + BUG_ON(ctx->mm != mm); + + VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); + VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + + /* + * If it's already released don't get it. This avoids to loop + * in __get_user_pages if userfaultfd_release waits on the + * caller of handle_userfault to release the mmap_sem. + */ + if (unlikely(ACCESS_ONCE(ctx->released))) + goto out; + + /* + * Check that we can return VM_FAULT_RETRY. + * + * NOTE: it should become possible to return VM_FAULT_RETRY + * even if FAULT_FLAG_TRIED is set without leading to gup() + * -EBUSY failures, if the userfaultfd is to be extended for + * VM_UFFD_WP tracking and we intend to arm the userfault + * without first stopping userland access to the memory. For + * VM_UFFD_MISSING userfaults this is enough for now. + */ + if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { + /* + * Validate the invariant that nowait must allow retry + * to be sure not to return SIGBUS erroneously on + * nowait invocations. + */ + BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); +#ifdef CONFIG_DEBUG_VM + if (printk_ratelimit()) { + printk(KERN_WARNING + "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); + dump_stack(); + } +#endif + goto out; + } + + /* + * Handle nowait, not much to do other than tell it to retry + * and wait. + */ + ret = VM_FAULT_RETRY; + if (flags & FAULT_FLAG_RETRY_NOWAIT) + goto out; + + /* take the reference before dropping the mmap_sem */ + userfaultfd_ctx_get(ctx); + + init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); + uwq.wq.private = current; + uwq.msg = userfault_msg(address, flags, reason); + uwq.ctx = ctx; + + return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); + + spin_lock(&ctx->fault_pending_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); + /* + * The smp_mb() after __set_current_state prevents the reads + * following the spin_unlock to happen before the list_add in + * __add_wait_queue. + */ + set_current_state(return_to_userland ? TASK_INTERRUPTIBLE : + TASK_KILLABLE); + spin_unlock(&ctx->fault_pending_wqh.lock); + + must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + up_read(&mm->mmap_sem); + + if (likely(must_wait && !ACCESS_ONCE(ctx->released) && + (return_to_userland ? !signal_pending(current) : + !fatal_signal_pending(current)))) { + wake_up_poll(&ctx->fd_wqh, POLLIN); + schedule(); + ret |= VM_FAULT_MAJOR; + } + + __set_current_state(TASK_RUNNING); + + if (return_to_userland) { + if (signal_pending(current) && + !fatal_signal_pending(current)) { + /* + * If we got a SIGSTOP or SIGCONT and this is + * a normal userland page fault, just let + * userland return so the signal will be + * handled and gdb debugging works. The page + * fault code immediately after we return from + * this function is going to release the + * mmap_sem and it's not depending on it + * (unlike gup would if we were not to return + * VM_FAULT_RETRY). + * + * If a fatal signal is pending we still take + * the streamlined VM_FAULT_RETRY failure path + * and there's no need to retake the mmap_sem + * in such case. + */ + down_read(&mm->mmap_sem); + ret = 0; + } + } + + /* + * Here we race with the list_del; list_add in + * userfaultfd_ctx_read(), however because we don't ever run + * list_del_init() to refile across the two lists, the prev + * and next pointers will never point to self. list_add also + * would never let any of the two pointers to point to + * self. So list_empty_careful won't risk to see both pointers + * pointing to self at any time during the list refile. The + * only case where list_del_init() is called is the full + * removal in the wake function and there we don't re-list_add + * and it's fine not to block on the spinlock. The uwq on this + * kernel stack can be released after the list_del_init. + */ + if (!list_empty_careful(&uwq.wq.task_list)) { + spin_lock(&ctx->fault_pending_wqh.lock); + /* + * No need of list_del_init(), the uwq on the stack + * will be freed shortly anyway. + */ + list_del(&uwq.wq.task_list); + spin_unlock(&ctx->fault_pending_wqh.lock); + } + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. + */ + userfaultfd_ctx_put(ctx); + +out: + return ret; +} + +static int userfaultfd_release(struct inode *inode, struct file *file) +{ + struct userfaultfd_ctx *ctx = file->private_data; + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev; + /* len == 0 means wake all */ + struct userfaultfd_wake_range range = { .len = 0, }; + unsigned long new_flags; + + ACCESS_ONCE(ctx->released) = true; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_sem. So + * it's critical that released is set to true (above), before + * taking the mmap_sem for writing. + */ + down_write(&mm->mmap_sem); + prev = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + cond_resched(); + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + } + up_write(&mm->mmap_sem); + + /* + * After no new page faults can wait on this fault_*wqh, flush + * the last page faults that may have been already waiting on + * the fault_*wqh. + */ + spin_lock(&ctx->fault_pending_wqh.lock); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); + spin_unlock(&ctx->fault_pending_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, POLLHUP); + userfaultfd_ctx_put(ctx); + return 0; +} + +/* fault_pending_wqh.lock must be hold by the caller */ +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) +{ + wait_queue_t *wq; + struct userfaultfd_wait_queue *uwq; + + VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); + + uwq = NULL; + if (!waitqueue_active(&ctx->fault_pending_wqh)) + goto out; + /* walk in reverse to provide FIFO behavior to read userfaults */ + wq = list_last_entry(&ctx->fault_pending_wqh.task_list, + typeof(*wq), task_list); + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); +out: + return uwq; +} + +static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) +{ + struct userfaultfd_ctx *ctx = file->private_data; + unsigned int ret; + + poll_wait(file, &ctx->fd_wqh, wait); + + switch (ctx->state) { + case UFFD_STATE_WAIT_API: + return POLLERR; + case UFFD_STATE_RUNNING: + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) + return POLLERR; + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = POLLIN; + return ret; + default: + BUG(); + } +} + +static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, + struct uffd_msg *msg) +{ + ssize_t ret; + DECLARE_WAITQUEUE(wait, current); + struct userfaultfd_wait_queue *uwq; + + /* always take the fd_wqh lock before the fault_pending_wqh lock */ + spin_lock(&ctx->fd_wqh.lock); + __add_wait_queue(&ctx->fd_wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock(&ctx->fault_pending_wqh.lock); + uwq = find_userfault(ctx); + if (uwq) { + /* + * Use a seqcount to repeat the lockless check + * in wake_userfault() to avoid missing + * wakeups because during the refile both + * waitqueue could become empty if this is the + * only userfault. + */ + write_seqcount_begin(&ctx->refile_seq); + + /* + * The fault_pending_wqh.lock prevents the uwq + * to disappear from under us. + * + * Refile this userfault from + * fault_pending_wqh to fault_wqh, it's not + * pending anymore after we read it. + * + * Use list_del() by hand (as + * userfaultfd_wake_function also uses + * list_del_init() by hand) to be sure nobody + * changes __remove_wait_queue() to use + * list_del_init() in turn breaking the + * !list_empty_careful() check in + * handle_userfault(). The uwq->wq.task_list + * must never be empty at any time during the + * refile, or the waitqueue could disappear + * from under us. The "wait_queue_head_t" + * parameter of __remove_wait_queue() is unused + * anyway. + */ + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->fault_wqh, &uwq->wq); + + write_seqcount_end(&ctx->refile_seq); + + /* careful to always initialize msg if ret == 0 */ + *msg = uwq->msg; + spin_unlock(&ctx->fault_pending_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->fault_pending_wqh.lock); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (no_wait) { + ret = -EAGAIN; + break; + } + spin_unlock(&ctx->fd_wqh.lock); + schedule(); + spin_lock(&ctx->fd_wqh.lock); + } + __remove_wait_queue(&ctx->fd_wqh, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock(&ctx->fd_wqh.lock); + + return ret; +} + +static ssize_t userfaultfd_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct userfaultfd_ctx *ctx = file->private_data; + ssize_t _ret, ret = 0; + struct uffd_msg msg; + int no_wait = file->f_flags & O_NONBLOCK; + + if (ctx->state == UFFD_STATE_WAIT_API) + return -EINVAL; + + for (;;) { + if (count < sizeof(msg)) + return ret ? ret : -EINVAL; + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); + if (_ret < 0) + return ret ? ret : _ret; + if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) + return ret ? ret : -EFAULT; + ret += sizeof(msg); + buf += sizeof(msg); + count -= sizeof(msg); + /* + * Allow to read more than one fault at time but only + * block if waiting for the very first one. + */ + no_wait = O_NONBLOCK; + } +} + +static void __wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned long start, end; + + start = range->start; + end = range->start + range->len; + + spin_lock(&ctx->fault_pending_wqh.lock); + /* wake all in the range and autoremove */ + if (waitqueue_active(&ctx->fault_pending_wqh)) + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, + range); + if (waitqueue_active(&ctx->fault_wqh)) + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); + spin_unlock(&ctx->fault_pending_wqh.lock); +} + +static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned seq; + bool need_wakeup; + + /* + * To be sure waitqueue_active() is not reordered by the CPU + * before the pagetable update, use an explicit SMP memory + * barrier here. PT lock release or up_read(mmap_sem) still + * have release semantics that can allow the + * waitqueue_active() to be reordered before the pte update. + */ + smp_mb(); + + /* + * Use waitqueue_active because it's very frequent to + * change the address space atomically even if there are no + * userfaults yet. So we take the spinlock only when we're + * sure we've userfaults to wake. + */ + do { + seq = read_seqcount_begin(&ctx->refile_seq); + need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || + waitqueue_active(&ctx->fault_wqh); + cond_resched(); + } while (read_seqcount_retry(&ctx->refile_seq, seq)); + if (need_wakeup) + __wake_userfault(ctx, range); +} + +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + __u64 task_size = mm->task_size; + + if (start & ~PAGE_MASK) + return -EINVAL; + if (len & ~PAGE_MASK) + return -EINVAL; + if (!len) + return -EINVAL; + if (start < mmap_min_addr) + return -EINVAL; + if (start >= task_size) + return -EINVAL; + if (len > task_size - start) + return -EINVAL; + return 0; +} + +static int userfaultfd_register(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_register uffdio_register; + struct uffdio_register __user *user_uffdio_register; + unsigned long vm_flags, new_flags; + bool found; + unsigned long start, end, vma_end; + + user_uffdio_register = (struct uffdio_register __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_register, user_uffdio_register, + sizeof(uffdio_register)-sizeof(__u64))) + goto out; + + ret = -EINVAL; + if (!uffdio_register.mode) + goto out; + if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| + UFFDIO_REGISTER_MODE_WP)) + goto out; + vm_flags = 0; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) + vm_flags |= VM_UFFD_MISSING; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + vm_flags |= VM_UFFD_WP; + /* + * FIXME: remove the below error constraint by + * implementing the wprotect tracking mode. + */ + ret = -EINVAL; + goto out; + } + + ret = validate_range(mm, uffdio_register.range.start, + uffdio_register.range.len); + if (ret) + goto out; + + start = uffdio_register.range.start; + end = start + uffdio_register.range.len; + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + + ret = -ENOMEM; + if (!vma) + goto out_unlock; + + /* check that there's at least one vma in the range */ + ret = -EINVAL; + if (vma->vm_start >= end) + goto out_unlock; + + /* + * Search for not compatible vmas. + * + * FIXME: this shall be relaxed later so that it doesn't fail + * on tmpfs backed vmas (in addition to the current allowance + * on anonymous vmas). + */ + found = false; + for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + cond_resched(); + + BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + + /* check not compatible vmas */ + ret = -EINVAL; + if (cur->vm_ops) + goto out_unlock; + + /* + * Check that this vma isn't already owned by a + * different userfaultfd. We can't allow more than one + * userfaultfd to own a single vma simultaneously or we + * wouldn't know which one to deliver the userfaults to. + */ + ret = -EBUSY; + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; + + found = true; + } + BUG_ON(!found); + + if (vma->vm_start < start) + prev = vma; + + ret = 0; + do { + cond_resched(); + + BUG_ON(vma->vm_ops); + BUG_ON(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; + prev = vma_merge(mm, prev, start, vma_end, new_flags, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + ((struct vm_userfaultfd_ctx){ ctx })); + if (prev) { + vma = prev; + goto next; + } + if (vma->vm_start < start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + break; + } + if (vma->vm_end > end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + break; + } + next: + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx.ctx = ctx; + + skip: + prev = vma; + start = vma->vm_end; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); +out_unlock: + up_write(&mm->mmap_sem); + if (!ret) { + /* + * Now that we scanned all vmas we can already tell + * userland which ioctls methods are guaranteed to + * succeed on this range. + */ + if (put_user(UFFD_API_RANGE_IOCTLS, + &user_uffdio_register->ioctls)) + ret = -EFAULT; + } +out: + return ret; +} + +static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_range uffdio_unregister; + unsigned long new_flags; + bool found; + unsigned long start, end, vma_end; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) + goto out; + + ret = validate_range(mm, uffdio_unregister.start, + uffdio_unregister.len); + if (ret) + goto out; + + start = uffdio_unregister.start; + end = start + uffdio_unregister.len; + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + + ret = -ENOMEM; + if (!vma) + goto out_unlock; + + /* check that there's at least one vma in the range */ + ret = -EINVAL; + if (vma->vm_start >= end) + goto out_unlock; + + /* + * Search for not compatible vmas. + * + * FIXME: this shall be relaxed later so that it doesn't fail + * on tmpfs backed vmas (in addition to the current allowance + * on anonymous vmas). + */ + found = false; + ret = -EINVAL; + for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + cond_resched(); + + BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + + /* + * Check not compatible vmas, not strictly required + * here as not compatible vmas cannot have an + * userfaultfd_ctx registered on them, but this + * provides for more strict behavior to notice + * unregistration errors. + */ + if (cur->vm_ops) + goto out_unlock; + + found = true; + } + BUG_ON(!found); + + if (vma->vm_start < start) + prev = vma; + + ret = 0; + do { + cond_resched(); + + BUG_ON(vma->vm_ops); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (!vma->vm_userfaultfd_ctx.ctx) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + prev = vma_merge(mm, prev, start, vma_end, new_flags, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) { + vma = prev; + goto next; + } + if (vma->vm_start < start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + break; + } + if (vma->vm_end > end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + break; + } + next: + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + + skip: + prev = vma; + start = vma->vm_end; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); +out_unlock: + up_write(&mm->mmap_sem); +out: + return ret; +} + +/* + * userfaultfd_wake may be used in combination with the + * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. + */ +static int userfaultfd_wake(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_range uffdio_wake; + struct userfaultfd_wake_range range; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) + goto out; + + ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + if (ret) + goto out; + + range.start = uffdio_wake.start; + range.len = uffdio_wake.len; + + /* + * len == 0 means wake all and we don't want to wake all here, + * so check it again to be sure. + */ + VM_BUG_ON(!range.len); + + wake_userfault(ctx, &range); + ret = 0; + +out: + return ret; +} + +static int userfaultfd_copy(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_copy uffdio_copy; + struct uffdio_copy __user *user_uffdio_copy; + struct userfaultfd_wake_range range; + + user_uffdio_copy = (struct uffdio_copy __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_copy, user_uffdio_copy, + /* don't copy "copy" last field */ + sizeof(uffdio_copy)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + if (ret) + goto out; + /* + * double check for wraparound just in case. copy_from_user() + * will later check uffdio_copy.src + uffdio_copy.len to fit + * in the userland range. + */ + ret = -EINVAL; + if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) + goto out; + if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + goto out; + + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; + if (ret < 0) + goto out; + BUG_ON(!ret); + /* len == 0 would wake all */ + range.len = ret; + if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { + range.start = uffdio_copy.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_zeropage uffdio_zeropage; + struct uffdio_zeropage __user *user_uffdio_zeropage; + struct userfaultfd_wake_range range; + + user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, + /* don't copy "zeropage" last field */ + sizeof(uffdio_zeropage)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (ret) + goto out; + ret = -EINVAL; + if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) + goto out; + + ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; + if (ret < 0) + goto out; + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { + range.start = uffdio_zeropage.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; +out: + return ret; +} + +/* + * userland asks for a certain API version and we return which bits + * and ioctl commands are implemented in this kernel for such API + * version or -EINVAL if unknown. + */ +static int userfaultfd_api(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct uffdio_api uffdio_api; + void __user *buf = (void __user *)arg; + int ret; + + ret = -EINVAL; + if (ctx->state != UFFD_STATE_WAIT_API) + goto out; + ret = -EFAULT; + if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) + goto out; + if (uffdio_api.api != UFFD_API || uffdio_api.features) { + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + ret = -EINVAL; + goto out; + } + uffdio_api.features = UFFD_API_FEATURES; + uffdio_api.ioctls = UFFD_API_IOCTLS; + ret = -EFAULT; + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + ctx->state = UFFD_STATE_RUNNING; + ret = 0; +out: + return ret; +} + +static long userfaultfd_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct userfaultfd_ctx *ctx = file->private_data; + + if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + return -EINVAL; + + switch(cmd) { + case UFFDIO_API: + ret = userfaultfd_api(ctx, arg); + break; + case UFFDIO_REGISTER: + ret = userfaultfd_register(ctx, arg); + break; + case UFFDIO_UNREGISTER: + ret = userfaultfd_unregister(ctx, arg); + break; + case UFFDIO_WAKE: + ret = userfaultfd_wake(ctx, arg); + break; + case UFFDIO_COPY: + ret = userfaultfd_copy(ctx, arg); + break; + case UFFDIO_ZEROPAGE: + ret = userfaultfd_zeropage(ctx, arg); + break; + } + return ret; +} + +#ifdef CONFIG_PROC_FS +static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct userfaultfd_ctx *ctx = f->private_data; + wait_queue_t *wq; + struct userfaultfd_wait_queue *uwq; + unsigned long pending = 0, total = 0; + + spin_lock(&ctx->fault_pending_wqh.lock); + list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + pending++; + total++; + } + list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + total++; + } + spin_unlock(&ctx->fault_pending_wqh.lock); + + /* + * If more protocols will be added, there will be all shown + * separated by a space. Like this: + * protocols: aa:... bb:... + */ + seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", + pending, total, UFFD_API, UFFD_API_FEATURES, + UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); +} +#endif + +static const struct file_operations userfaultfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = userfaultfd_show_fdinfo, +#endif + .release = userfaultfd_release, + .poll = userfaultfd_poll, + .read = userfaultfd_read, + .unlocked_ioctl = userfaultfd_ioctl, + .compat_ioctl = userfaultfd_ioctl, + .llseek = noop_llseek, +}; + +static void init_once_userfaultfd_ctx(void *mem) +{ + struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; + + init_waitqueue_head(&ctx->fault_pending_wqh); + init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->fd_wqh); + seqcount_init(&ctx->refile_seq); +} + +/** + * userfaultfd_file_create - Creates an userfaultfd file pointer. + * @flags: Flags for the userfaultfd file. + * + * This function creates an userfaultfd file pointer, w/out installing + * it into the fd table. This is useful when the userfaultfd file is + * used during the initialization of data structures that require + * extra setup after the userfaultfd creation. So the userfaultfd + * creation is split into the file pointer creation phase, and the + * file descriptor installation phase. In this way races with + * userspace closing the newly installed file descriptor can be + * avoided. Returns an userfaultfd file pointer, or a proper error + * pointer. + */ +static struct file *userfaultfd_file_create(int flags) +{ + struct file *file; + struct userfaultfd_ctx *ctx; + + BUG_ON(!current->mm); + + /* Check the UFFD_* constants for consistency. */ + BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); + + file = ERR_PTR(-EINVAL); + if (flags & ~UFFD_SHARED_FCNTL_FLAGS) + goto out; + + file = ERR_PTR(-ENOMEM); + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) + goto out; + + atomic_set(&ctx->refcount, 1); + ctx->flags = flags; + ctx->state = UFFD_STATE_WAIT_API; + ctx->released = false; + ctx->mm = current->mm; + /* prevent the mm struct to be freed */ + atomic_inc(&ctx->mm->mm_users); + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + if (IS_ERR(file)) + kmem_cache_free(userfaultfd_ctx_cachep, ctx); +out: + return file; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + int fd, error; + struct file *file; + + error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); + if (error < 0) + return error; + fd = error; + + file = userfaultfd_file_create(flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + + return error; +} + +static int __init userfaultfd_init(void) +{ + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", + sizeof(struct userfaultfd_ctx), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + init_once_userfaultfd_ctx); + return 0; +} +__initcall(userfaultfd_init); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index df6828570e87..a096841bd06c 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_attr.o \ xfs_attr_leaf.o \ xfs_attr_remote.o \ + xfs_bit.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ @@ -63,7 +64,6 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs-y += xfs_aops.o \ xfs_attr_inactive.o \ xfs_attr_list.o \ - xfs_bit.o \ xfs_bmap_util.o \ xfs_buf.o \ xfs_dir2_readdir.o \ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 516162be1398..ffad7f20342f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -149,13 +149,27 @@ xfs_alloc_compute_aligned( { xfs_agblock_t bno; xfs_extlen_t len; + xfs_extlen_t diff; /* Trim busy sections out of found extent */ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + /* + * If we have a largish extent that happens to start before min_agbno, + * see if we can shift it into range... + */ + if (bno < args->min_agbno && bno + len > args->min_agbno) { + diff = args->min_agbno - bno; + if (len > diff) { + bno += diff; + len -= diff; + } + } + if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); - xfs_extlen_t diff = aligned_bno - bno; + + diff = aligned_bno - bno; *resbno = aligned_bno; *reslen = diff >= len ? 0 : len - diff; @@ -450,7 +464,7 @@ xfs_agfl_verify( struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); int i; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) return false; @@ -795,9 +809,13 @@ xfs_alloc_find_best_extent( * The good extent is closer than this one. */ if (!dir) { + if (*sbnoa > args->max_agbno) + goto out_use_good; if (*sbnoa >= args->agbno + gdiff) goto out_use_good; } else { + if (*sbnoa < args->min_agbno) + goto out_use_good; if (*sbnoa <= args->agbno - gdiff) goto out_use_good; } @@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near( dofirst = prandom_u32() & 1; #endif + /* handle unitialized agbno range so caller doesn't have to */ + if (!args->min_agbno && !args->max_agbno) + args->max_agbno = args->mp->m_sb.sb_agblocks - 1; + ASSERT(args->min_agbno <= args->max_agbno); + + /* clamp agbno to the range if it's outside */ + if (args->agbno < args->min_agbno) + args->agbno = args->min_agbno; + if (args->agbno > args->max_agbno) + args->agbno = args->max_agbno; + restart: bno_cur_lt = NULL; bno_cur_gt = NULL; @@ -976,6 +1005,8 @@ restart: <bnoa, <lena); if (ltlena < args->minlen) continue; + if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) + continue; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ASSERT(args->len >= args->minlen); @@ -1096,11 +1127,11 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, <bnoa, <lena); - if (ltlena >= args->minlen) + if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) break; if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) goto error0; - if (!i) { + if (!i || ltbnoa < args->min_agbno) { xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); bno_cur_lt = NULL; @@ -1112,11 +1143,11 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, gtbno, gtlen, >bnoa, >lena); - if (gtlena >= args->minlen) + if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) break; if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; - if (!i) { + if (!i || gtbnoa > args->max_agbno) { xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); bno_cur_gt = NULL; @@ -1216,6 +1247,7 @@ restart: ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno); args->agbno = ltnew; if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, @@ -1825,11 +1857,11 @@ xfs_alloc_compute_maxlevels( xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_perag *pag, + xfs_extlen_t need) { - xfs_extlen_t need, delta = 0; + xfs_extlen_t delta = 0; - need = XFS_MIN_FREELIST_PAG(pag, mp); if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; @@ -1838,131 +1870,150 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +unsigned int +xfs_alloc_min_freelist( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + unsigned int min_free; + + /* space needed by-bno freespace btree */ + min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + mp->m_ag_maxlevels); + /* space needed by-size freespace btree */ + min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + mp->m_ag_maxlevels); + + return min_free; +} + +/* + * Check if the operation we are fixing up the freelist for should go ahead or + * not. If we are freeing blocks, we always allow it, otherwise the allocation + * is dependent on whether the size and shape of free space available will + * permit the requested allocation to take place. + */ +static bool +xfs_alloc_space_available( + struct xfs_alloc_arg *args, + xfs_extlen_t min_free, + int flags) +{ + struct xfs_perag *pag = args->pag; + xfs_extlen_t longest; + int available; + + if (flags & XFS_ALLOC_FLAG_FREEING) + return true; + + /* do we have enough contiguous free space for the allocation? */ + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + return false; + + /* do have enough free space remaining for the allocation? */ + available = (int)(pag->pagf_freeblks + pag->pagf_flcount - + min_free - args->total); + if (available < (int)args->minleft) + return false; + + return true; +} + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ STATIC int /* error */ xfs_alloc_fix_freelist( - xfs_alloc_arg_t *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + struct xfs_alloc_arg *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ { - xfs_buf_t *agbp; /* agf buffer pointer */ - xfs_agf_t *agf; /* a.g. freespace structure pointer */ - xfs_buf_t *agflbp;/* agfl buffer pointer */ - xfs_agblock_t bno; /* freelist block */ - xfs_extlen_t delta; /* new blocks needed in freelist */ - int error; /* error result code */ - xfs_extlen_t longest;/* longest extent in allocation group */ - xfs_mount_t *mp; /* file system mount point structure */ - xfs_extlen_t need; /* total blocks needed in freelist */ - xfs_perag_t *pag; /* per-ag information structure */ - xfs_alloc_arg_t targs; /* local allocation arguments */ - xfs_trans_t *tp; /* transaction pointer */ + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag = args->pag; + struct xfs_trans *tp = args->tp; + struct xfs_buf *agbp = NULL; + struct xfs_buf *agflbp = NULL; + struct xfs_alloc_arg targs; /* local allocation arguments */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t need; /* total blocks needed in freelist */ + int error = 0; - mp = args->mp; - - pag = args->pag; - tp = args->tp; if (!pag->pagf_init) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; if (!pag->pagf_init) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - } else - agbp = NULL; + } /* - * If this is a metadata preferred pag and we are user data - * then try somewhere else if we are not being asked to - * try harder at this point + * If this is a metadata preferred pag and we are user data then try + * somewhere else if we are not being asked to try harder at this + * point */ if (pag->pagf_metadata && args->userdata && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - /* - * If it looks like there isn't a long enough extent, or enough - * total blocks, reject it. - */ - need = XFS_MIN_FREELIST_PAG(pag, mp); - longest = xfs_alloc_longest_free_extent(mp, pag); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(pag->pagf_freeblks + pag->pagf_flcount - - need - args->total) < (int)args->minleft)) { - if (agbp) - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; - } - } + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; /* * Get the a.g. freespace buffer. * Can fail if we're not blocking on locks, and it's held. */ - if (agbp == NULL) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; - if (agbp == NULL) { + if (!agbp) { + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; + if (!agbp) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; - } - } - /* - * Figure out how many blocks we should have in the freelist. - */ - agf = XFS_BUF_TO_AGF(agbp); - need = XFS_MIN_FREELIST(agf, mp); - /* - * If there isn't enough total or single-extent, reject it. - */ - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - delta = need > be32_to_cpu(agf->agf_flcount) ? - (need - be32_to_cpu(agf->agf_flcount)) : 0; - longest = be32_to_cpu(agf->agf_longest); - longest = (longest > delta) ? (longest - delta) : - (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(be32_to_cpu(agf->agf_freeblks) + - be32_to_cpu(agf->agf_flcount) - need - args->total) < - (int)args->minleft)) { - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + goto out_no_agbp; } } + + /* If there isn't enough total space or single-extent, reject it. */ + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; + /* * Make the freelist shorter if it's too long. + * + * Note that from this point onwards, we will always release the agf and + * agfl buffers on error. This handles the case where we error out and + * the buffers are clean or may not have been joined to the transaction + * and hence need to be released manually. If they have been joined to + * the transaction, then xfs_trans_brelse() will handle them + * appropriately based on the recursion count and dirty state of the + * buffer. + * + * XXX (dgc): When we have lots of free space, does this buy us + * anything other than extra overhead when we need to put more blocks + * back on the free list? Maybe we should only do this when space is + * getting low or the AGFL is more than half full? */ - while (be32_to_cpu(agf->agf_flcount) > need) { - xfs_buf_t *bp; + while (pag->pagf_flcount > need) { + struct xfs_buf *bp; error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) - return error; - if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) - return error; + goto out_agbp_relse; + error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1); + if (error) + goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); xfs_trans_binval(tp, bp); } - /* - * Initialize the args structure. - */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; @@ -1971,21 +2022,20 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) - return error; - /* - * Make the freelist longer if it's too short. - */ - while (be32_to_cpu(agf->agf_flcount) < need) { + error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + if (error) + goto out_agbp_relse; + + /* Make the freelist longer if it's too short. */ + while (pag->pagf_flcount < need) { targs.agbno = 0; - targs.maxlen = need - be32_to_cpu(agf->agf_flcount); - /* - * Allocate as many blocks as possible at once. - */ - if ((error = xfs_alloc_ag_vextent(&targs))) { - xfs_trans_brelse(tp, agflbp); - return error; - } + targs.maxlen = need - pag->pagf_flcount; + + /* Allocate as many blocks as possible at once. */ + error = xfs_alloc_ag_vextent(&targs); + if (error) + goto out_agflbp_relse; + /* * Stop if we run out. Won't happen if callers are obeying * the restrictions correctly. Can happen for free calls @@ -1994,9 +2044,7 @@ xfs_alloc_fix_freelist( if (targs.agbno == NULLAGBLOCK) { if (flags & XFS_ALLOC_FLAG_FREEING) break; - xfs_trans_brelse(tp, agflbp); - args->agbp = NULL; - return 0; + goto out_agflbp_relse; } /* * Put each allocated block on the list. @@ -2005,12 +2053,21 @@ xfs_alloc_fix_freelist( error = xfs_alloc_put_freelist(tp, agbp, agflbp, bno, 0); if (error) - return error; + goto out_agflbp_relse; } } xfs_trans_brelse(tp, agflbp); args->agbp = agbp; return 0; + +out_agflbp_relse: + xfs_trans_brelse(tp, agflbp); +out_agbp_relse: + if (agbp) + xfs_trans_brelse(tp, agbp); +out_no_agbp: + args->agbp = NULL; + return error; } /* @@ -2203,7 +2260,7 @@ xfs_agf_verify( struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d1b4b6a5c894..ca1c8168373a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg { xfs_extlen_t total; /* total blocks needed in xaction */ xfs_extlen_t alignment; /* align answer to multiple of this */ xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */ + xfs_agblock_t max_agbno; /* ... */ xfs_extlen_t len; /* output: actual size of extent */ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ xfs_alloctype_t otype; /* original allocation type */ @@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ -/* - * Find the length of the longest extent in an AG. - */ -xfs_extlen_t -xfs_alloc_longest_free_extent(struct xfs_mount *mp, +xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag, xfs_extlen_t need); +unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); /* diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 59d521c09a17..90de071dd4c2 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -295,7 +295,7 @@ xfs_allocbt_verify( case cpu_to_be32(XFS_ABTB_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; @@ -313,7 +313,7 @@ xfs_allocbt_verify( case cpu_to_be32(XFS_ABTC_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0a472fbe06d4..ff065578969f 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -139,6 +139,8 @@ xfs_attr_get( args.value = value; args.valuelen = *valuelenp; + /* Entirely possible to look up a name which doesn't exist */ + args.op_flags = XFS_DA_OP_OKNOENT; lock_mode = xfs_ilock_attr_map_shared(ip); if (!xfs_inode_hasattr(ip)) @@ -266,7 +268,7 @@ xfs_attr_set( tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; error = xfs_trans_reserve(args.trans, &tres, args.total, 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -276,7 +278,7 @@ xfs_attr_set( XFS_QMOPT_RES_REGBLKS); if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(args.trans); return error; } @@ -320,8 +322,7 @@ xfs_attr_set( xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); } - err2 = xfs_trans_commit(args.trans, - XFS_TRANS_RELEASE_LOG_RES); + err2 = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error ? error : err2; @@ -383,16 +384,14 @@ xfs_attr_set( * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -462,7 +461,7 @@ xfs_attr_remove( error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, XFS_ATTRRM_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } @@ -501,16 +500,14 @@ xfs_attr_remove( * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index e9d401ce93bb..33df52d97ec7 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -262,7 +262,7 @@ xfs_attr3_leaf_verify( if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) return false; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; @@ -1056,7 +1056,7 @@ xfs_attr3_leaf_create( hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); } else { diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 20de88d1bf86..f38f9bd81557 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -100,7 +100,7 @@ xfs_attr3_rmt_verify( return false; if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) return false; - if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(rmt->rm_blkno) != bno) return false; @@ -159,11 +159,10 @@ xfs_attr3_rmt_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + int blksize = mp->m_attr_geo->blksize; char *ptr; int len; xfs_daddr_t bno; - int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -175,16 +174,22 @@ xfs_attr3_rmt_write_verify( ASSERT(len >= blksize); while (len > 0) { + struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } - if (bip) { - struct xfs_attr3_rmt_hdr *rmt; - rmt = (struct xfs_attr3_rmt_hdr *)ptr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + /* + * Ensure we aren't writing bogus LSNs to disk. See + * xfs_attr3_rmt_hdr_set() for the explanation. + */ + if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; } xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); @@ -217,10 +222,22 @@ xfs_attr3_rmt_hdr_set( rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); rmt->rm_offset = cpu_to_be32(offset); rmt->rm_bytes = cpu_to_be32(size); - uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid); rmt->rm_owner = cpu_to_be64(ino); rmt->rm_blkno = cpu_to_be64(bno); + /* + * Remote attribute blocks are written synchronously, so we don't + * have an LSN that we can stamp in them that makes any sense to log + * recovery. To ensure that log recovery handles overwrites of these + * blocks sanely (i.e. once they've been freed and reallocated as some + * other type of metadata) we need to ensure that the LSN has a value + * that tells log recovery to ignore the LSN and overwrite the buffer + * with whatever is in it's log. To do this, we use the magic + * NULLCOMMITLSN to indicate that the LSN is invalid. + */ + rmt->rm_lsn = cpu_to_be64(NULLCOMMITLSN); + return sizeof(struct xfs_attr3_rmt_hdr); } @@ -434,14 +451,21 @@ xfs_attr_rmtval_set( /* * Allocate a single extent, up to the size of the value. + * + * Note that we have to consider this a data allocation as we + * write the remote attribute without logging the contents. + * Hence we must ensure that we aren't using blocks that are on + * the busy list so that we don't overwrite blocks which have + * recently been freed but their transactions are not yet + * committed to disk. If we overwrite the contents of a busy + * extent and then crash then the block may not contain the + * correct metadata after log recovery occurs. */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, args->total, &map, &nmap, - args->flist); + blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, + args->total, &map, &nmap, args->flist); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -594,9 +618,8 @@ xfs_attr_rmtval_remove( xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - 1, args->firstblock, args->flist, - &done); + XFS_BMAPI_ATTRFORK, 1, args->firstblock, + args->flist, &done); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 0e8885a59646..0e8885a59646 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f1026e86dabc..8e2010d53b07 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ - int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork( tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); if (error) goto trans_cancel; - cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { @@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; bmap_cancel: xfs_bmap_cancel(&flist); trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -3521,7 +3518,8 @@ xfs_bmap_longest_free_extent( } } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (*blen < longest) *blen = longest; @@ -4424,7 +4422,15 @@ xfs_bmapi_convert_unwritten( error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, &tmp_logflags); - bma->logflags |= tmp_logflags; + /* + * Log the inode core unconditionally in the unwritten extent conversion + * path because the conversion might not have done so (e.g., if the + * extent count hasn't changed). We need to make sure the inode is dirty + * in the transaction for the sake of fsync(), even if nothing has + * changed, because fsync() will not force the log for this transaction + * unless it sees the inode pinned. + */ + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -5918,7 +5924,7 @@ xfs_bmap_split_extent( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -5936,10 +5942,10 @@ xfs_bmap_split_extent( if (error) goto out; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - + return xfs_trans_commit(tp); out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 2c44c8e50782..6b0cf6546a82 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -349,7 +349,8 @@ xfs_bmbt_to_bmdr( if (xfs_sb_version_hascrc(&mp->m_sb)) { ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); - ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, + &mp->m_sb.sb_meta_uuid)); ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); } else @@ -647,7 +648,7 @@ xfs_bmbt_verify( case cpu_to_be32(XFS_BMAP_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) return false; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c72283dd8d44..f7d7ee7a2607 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -65,7 +65,8 @@ xfs_btree_check_lblock( if (xfs_sb_version_hascrc(&mp->m_sb)) { lblock_ok = lblock_ok && - uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + uuid_equal(&block->bb_u.l.bb_uuid, + &mp->m_sb.sb_meta_uuid) && block->bb_u.l.bb_blkno == cpu_to_be64( bp ? bp->b_bn : XFS_BUF_DADDR_NULL); } @@ -115,7 +116,8 @@ xfs_btree_check_sblock( if (xfs_sb_version_hascrc(&mp->m_sb)) { sblock_ok = sblock_ok && - uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + uuid_equal(&block->bb_u.s.bb_uuid, + &mp->m_sb.sb_meta_uuid) && block->bb_u.s.bb_blkno == cpu_to_be64( bp ? bp->b_bn : XFS_BUF_DADDR_NULL); } @@ -1000,7 +1002,7 @@ xfs_btree_init_block_int( if (flags & XFS_BTREE_CRC_BLOCKS) { buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); buf->bb_u.l.bb_owner = cpu_to_be64(owner); - uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.l.bb_pad = 0; buf->bb_u.l.bb_lsn = 0; } @@ -1013,7 +1015,7 @@ xfs_btree_init_block_int( if (flags & XFS_BTREE_CRC_BLOCKS) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); buf->bb_u.s.bb_owner = cpu_to_be32(__owner); - uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.s.bb_lsn = 0; } } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 2385f8cd08ab..be43248a5822 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -146,7 +146,7 @@ xfs_da3_node_verify( if (ichdr.magic != XFS_DA3_NODE_MAGIC) return false; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; @@ -233,6 +233,7 @@ xfs_da3_node_read_verify( bp->b_ops->verify_read(bp); return; default: + xfs_buf_ioerror(bp, -EFSCORRUPTED); break; } @@ -324,7 +325,7 @@ xfs_da3_node_create( ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(bp->b_bn); hdr3->info.owner = cpu_to_be64(args->dp->i_ino); - uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { ichdr.magic = XFS_DA_NODE_MAGIC; } @@ -1822,6 +1823,7 @@ xfs_da3_path_shift( struct xfs_da_args *args; struct xfs_da_node_entry *btree; struct xfs_da3_icnode_hdr nodehdr; + struct xfs_buf *bp; xfs_dablk_t blkno = 0; int level; int error; @@ -1866,20 +1868,24 @@ xfs_da3_path_shift( */ for (blk++, level++; level < path->active; blk++, level++) { /* - * Release the old block. - * (if it's dirty, trans won't actually let go) + * Read the next child block into a local buffer. */ - if (release) - xfs_trans_brelse(args->trans, blk->bp); + error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp, + args->whichfork); + if (error) + return error; /* - * Read the next child block. + * Release the old block (if it's dirty, the trans doesn't + * actually let go) and swap the local buffer into the path + * structure. This ensures failure of the above read doesn't set + * a NULL buffer in an active slot in the path. */ + if (release) + xfs_trans_brelse(args->trans, blk->bp); blk->blkno = blkno; - error = xfs_da3_node_read(args->trans, dp, blkno, -1, - &blk->bp, args->whichfork); - if (error) - return error; + blk->bp = bp; + info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || @@ -2351,8 +2357,8 @@ xfs_da_shrink_inode( * the last block to the place we want to kill. */ error = xfs_bunmapi(tp, dp, dead_blkno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, &done); + xfs_bmapi_aflag(w), 0, args->firstblock, + args->flist, &done); if (error == -ENOSPC) { if (w != XFS_DATA_FORK) break; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 74bcbabfa523..b14bbd6bb05f 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -680,8 +680,15 @@ typedef struct xfs_attr_leaf_name_remote { typedef struct xfs_attr_leafblock { xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ - xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ - xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced and definining them can actually make gcc optimize away + * accesses to the 'entries' array above index 0 so don't do that. + * + * xfs_attr_leaf_name_local_t namelist; + * xfs_attr_leaf_name_remote_t valuelist; + */ } xfs_attr_leafblock_t; /* diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index a69fb3a1e161..9de401d297e5 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -362,6 +362,7 @@ xfs_dir_lookup( struct xfs_da_args *args; int rval; int v; /* type-checking value */ + int lock_mode; ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); @@ -387,6 +388,7 @@ xfs_dir_lookup( if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; + lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_lookup(args); goto out_check_rval; @@ -419,6 +421,7 @@ out_check_rval: } } out_free: + xfs_iunlock(dp, lock_mode); kmem_free(args); return rval; } @@ -674,25 +677,22 @@ xfs_dir2_shrink_inode( mp = dp->i_mount; tp = args->trans; da = xfs_dir2_db_to_da(args->geo, db); - /* - * Unmap the fsblock(s). - */ - if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, - XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, - &done))) { + + /* Unmap the fsblock(s). */ + error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, + args->firstblock, args->flist, &done); + if (error) { /* - * ENOSPC actually can happen if we're in a removename with - * no space reservation, and the resulting block removal - * would cause a bmap btree split or conversion from extents - * to btree. This can only happen for un-fragmented - * directory blocks, since you need to be punching out - * the middle of an extent. - * In this case we need to leave the block in the file, - * and not binval it. - * So the block has to be in a consistent empty state - * and appropriately logged. - * We don't free up the buffer, the caller can tell it - * hasn't happened since it got an error back. + * ENOSPC actually can happen if we're in a removename with no + * space reservation, and the resulting block removal would + * cause a bmap btree split or conversion from extents to btree. + * This can only happen for un-fragmented directory blocks, + * since you need to be punching out the middle of an extent. + * In this case we need to leave the block in the file, and not + * binval it. So the block has to be in a consistent empty + * state and appropriately logged. We don't free up the buffer, + * the caller can tell it hasn't happened since it got an error + * back. */ return error; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9354e190b82e..4778d1dd511a 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -67,7 +67,7 @@ xfs_dir3_block_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -157,7 +157,7 @@ xfs_dir3_block_init( hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index de1ea16f5748..824131e71bc5 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -220,7 +220,7 @@ xfs_dir3_data_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -252,7 +252,8 @@ xfs_dir3_data_reada_verify( return; case cpu_to_be32(XFS_DIR2_DATA_MAGIC): case cpu_to_be32(XFS_DIR3_DATA_MAGIC): - xfs_dir3_data_verify(bp); + bp->b_ops = &xfs_dir3_data_buf_ops; + bp->b_ops->verify_read(bp); return; default: xfs_buf_ioerror(bp, -EFSCORRUPTED); @@ -604,7 +605,7 @@ xfs_dir3_data_init( hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(bp->b_bn); hdr3->owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); } else hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 106119955400..f300240ebb8d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -160,7 +160,7 @@ xfs_dir3_leaf_verify( if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) return false; - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) return false; @@ -310,7 +310,7 @@ xfs_dir3_leaf_init( : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); leaf3->info.blkno = cpu_to_be64(bp->b_bn); leaf3->info.owner = cpu_to_be64(owner); - uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { memset(leaf, 0, sizeof(*leaf)); leaf->hdr.info.magic = cpu_to_be16(type); diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 41b80d3d3877..cc28e924545b 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -93,7 +93,7 @@ xfs_dir3_free_verify( if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) return false; - if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; @@ -226,7 +226,7 @@ xfs_dir3_free_get_buf( hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); - uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else hdr.magic = XFS_DIR2_FREE_MAGIC; dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); @@ -1845,8 +1845,7 @@ xfs_dir2_node_addname_int( if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { xfs_alert(mp, - "%s: dir ino %llu needed freesp block %lld for\n" - " data block %lld, got %lld ifbno %llu lastfbno %d", +"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", __func__, (unsigned long long)dp->i_ino, (long long)dp->d_ops->db_to_fdb( args->geo, dbno), @@ -2132,6 +2131,7 @@ xfs_dir2_node_replace( int error; /* error return value */ int i; /* btree level */ xfs_ino_t inum; /* new inode number */ + int ftype; /* new file type */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ int rval; /* internal return value */ @@ -2145,7 +2145,14 @@ xfs_dir2_node_replace( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; + + /* + * We have to save new inode number and ftype since + * xfs_da3_node_lookup_int() is going to overwrite them + */ inum = args->inumber; + ftype = args->filetype; + /* * Lookup the entry to change in the btree. */ @@ -2183,7 +2190,7 @@ xfs_dir2_node_replace( * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); - args->dp->d_ops->data_put_ftype(dep, args->filetype); + args->dp->d_ops->data_put_ftype(dep, ftype); xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 6fbf2d853a54..5331b7f0460c 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -163,7 +163,7 @@ xfs_dqcheck( d->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -198,7 +198,7 @@ xfs_dquot_buf_verify_crc( if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF)) return false; - if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid)) return false; } return true; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 4daaa662337b..9590a069e556 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -100,7 +100,7 @@ typedef struct xfs_sb { xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ xfs_rtblock_t sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ + uuid_t sb_uuid; /* user-visible file system unique id */ xfs_fsblock_t sb_logstart; /* starting block of log if internal */ xfs_ino_t sb_rootino; /* root inode number */ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ @@ -170,10 +170,11 @@ typedef struct xfs_sb { __uint32_t sb_features_log_incompat; __uint32_t sb_crc; /* superblock crc */ - __uint32_t sb_pad; + xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ xfs_ino_t sb_pquotino; /* project quota inode */ xfs_lsn_t sb_lsn; /* last write sequence */ + uuid_t sb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -190,7 +191,7 @@ typedef struct xfs_dsb { __be64 sb_dblocks; /* number of data blocks */ __be64 sb_rblocks; /* number of realtime blocks */ __be64 sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ + uuid_t sb_uuid; /* user-visible file system unique id */ __be64 sb_logstart; /* starting block of log if internal */ __be64 sb_rootino; /* root inode number */ __be64 sb_rbmino; /* bitmap inode for realtime extents */ @@ -256,10 +257,11 @@ typedef struct xfs_dsb { __be32 sb_features_log_incompat; __le32 sb_crc; /* superblock crc */ - __be32 sb_pad; + __be32 sb_spino_align; /* sparse inode chunk alignment */ __be64 sb_pquotino; /* project quota inode */ __be64 sb_lsn; /* last write sequence */ + uuid_t sb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ } xfs_dsb_t; @@ -457,8 +459,12 @@ xfs_sb_has_ro_compat_feature( } #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ +#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE) + (XFS_SB_FEAT_INCOMPAT_FTYPE| \ + XFS_SB_FEAT_INCOMPAT_SPINODES| \ + XFS_SB_FEAT_INCOMPAT_META_UUID) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -506,6 +512,24 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); } +static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); +} + +/* + * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID + * is stored separately from the user-visible UUID; this allows the + * user-visible UUID to be changed on V5 filesystems which have a + * filesystem UUID stamped into every piece of metadata. + */ +static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID); +} + /* * end of superblock version macros */ @@ -758,19 +782,6 @@ typedef struct xfs_agfl { #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) - -#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) -#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ - (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) -#define XFS_MIN_FREELIST(a,mp) \ - (XFS_MIN_FREELIST_RAW( \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) -#define XFS_MIN_FREELIST_PAG(pag,mp) \ - (XFS_MIN_FREELIST_RAW( \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) - #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) #define XFS_FSB_TO_AGNO(mp,fsbno) \ @@ -1216,26 +1227,54 @@ typedef __uint64_t xfs_inofree_t; #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) +#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ +#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t)) +#define XFS_INODES_PER_HOLEMASK_BIT \ + (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t))) + static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; } /* - * Data record structure + * The on-disk inode record structure has two formats. The original "full" + * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount + * and replaces the 3 high-order freecount bytes wth the holemask and inode + * count. + * + * The holemask of the sparse record format allows an inode chunk to have holes + * that refer to blocks not owned by the inode record. This facilitates inode + * allocation in the event of severe free space fragmentation. */ typedef struct xfs_inobt_rec { __be32 ir_startino; /* starting inode number */ - __be32 ir_freecount; /* count of free inodes (set bits) */ + union { + struct { + __be32 ir_freecount; /* count of free inodes */ + } f; + struct { + __be16 ir_holemask;/* hole mask for sparse chunks */ + __u8 ir_count; /* total inode count */ + __u8 ir_freecount; /* count of free inodes */ + } sp; + } ir_u; __be64 ir_free; /* free inode mask */ } xfs_inobt_rec_t; typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ - __int32_t ir_freecount; /* count of free inodes (set bits) */ + __uint16_t ir_holemask; /* hole mask for sparse chunks */ + __uint8_t ir_count; /* total inode count */ + __uint8_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ } xfs_inobt_rec_incore_t; +static inline bool xfs_inobt_issparse(uint16_t holemask) +{ + /* non-zero holemask represents a sparse rec. */ + return holemask; +} /* * Key structure @@ -1453,8 +1492,8 @@ struct xfs_acl { sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) /* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" -#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE "SGI_ACL_FILE" +#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 18dc721ca19f..89689c6a43e2 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ +#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 1c9e75521250..54deb2d12ac6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -65,6 +65,8 @@ xfs_inobt_lookup( int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_holemask = 0; + cur->bc_rec.i.ir_count = 0; cur->bc_rec.i.ir_freecount = 0; cur->bc_rec.i.ir_free = 0; return xfs_btree_lookup(cur, dir, stat); @@ -82,7 +84,14 @@ xfs_inobt_update( union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); - rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); + rec.inobt.ir_u.sp.ir_count = irec->ir_count; + rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); + } rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } @@ -100,12 +109,27 @@ xfs_inobt_get_rec( int error; error = xfs_btree_get_rec(cur, &rec, stat); - if (!error && *stat == 1) { - irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); - irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + if (error || *stat == 0) + return error; + + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); + irec->ir_count = rec->inobt.ir_u.sp.ir_count; + irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; + } else { + /* + * ir_holemask/ir_count not supported on-disk. Fill in hardcoded + * values for full inode chunks. + */ + irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; + irec->ir_count = XFS_INODES_PER_CHUNK; + irec->ir_freecount = + be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } - return error; + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + + return 0; } /* @@ -114,10 +138,14 @@ xfs_inobt_get_rec( STATIC int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, + __uint16_t holemask, + __uint8_t count, __int32_t freecount, xfs_inofree_t free, int *stat) { + cur->bc_rec.i.ir_holemask = holemask; + cur->bc_rec.i.ir_count = count; cur->bc_rec.i.ir_freecount = freecount; cur->bc_rec.i.ir_free = free; return xfs_btree_insert(cur, stat); @@ -154,7 +182,9 @@ xfs_inobt_insert( } ASSERT(i == 0); - error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL, + XFS_INODES_PER_CHUNK, + XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -220,6 +250,7 @@ xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, struct list_head *buffer_list, + int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, @@ -275,7 +306,7 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + xfs_icreate_log(tp, agno, agbno, icount, mp->m_sb.sb_inodesize, length, gen); } else version = 2; @@ -307,7 +338,8 @@ xfs_ialloc_inode_init( if (version == 3) { free->di_ino = cpu_to_be64(ino); ino++; - uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&free->di_uuid, + &mp->m_sb.sb_meta_uuid); xfs_dinode_calc_crc(mp, free); } else if (tp) { /* just log the inode core */ @@ -347,6 +379,214 @@ xfs_ialloc_inode_init( } /* + * Align startino and allocmask for a recently allocated sparse chunk such that + * they are fit for insertion (or merge) into the on-disk inode btrees. + * + * Background: + * + * When enabled, sparse inode support increases the inode alignment from cluster + * size to inode chunk size. This means that the minimum range between two + * non-adjacent inode records in the inobt is large enough for a full inode + * record. This allows for cluster sized, cluster aligned block allocation + * without need to worry about whether the resulting inode record overlaps with + * another record in the tree. Without this basic rule, we would have to deal + * with the consequences of overlap by potentially undoing recent allocations in + * the inode allocation codepath. + * + * Because of this alignment rule (which is enforced on mount), there are two + * inobt possibilities for newly allocated sparse chunks. One is that the + * aligned inode record for the chunk covers a range of inodes not already + * covered in the inobt (i.e., it is safe to insert a new sparse record). The + * other is that a record already exists at the aligned startino that considers + * the newly allocated range as sparse. In the latter case, record content is + * merged in hope that sparse inode chunks fill to full chunks over time. + */ +STATIC void +xfs_align_sparse_ino( + struct xfs_mount *mp, + xfs_agino_t *startino, + uint16_t *allocmask) +{ + xfs_agblock_t agbno; + xfs_agblock_t mod; + int offset; + + agbno = XFS_AGINO_TO_AGBNO(mp, *startino); + mod = agbno % mp->m_sb.sb_inoalignmt; + if (!mod) + return; + + /* calculate the inode offset and align startino */ + offset = mod << mp->m_sb.sb_inopblog; + *startino -= offset; + + /* + * Since startino has been aligned down, left shift allocmask such that + * it continues to represent the same physical inodes relative to the + * new startino. + */ + *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; +} + +/* + * Determine whether the source inode record can merge into the target. Both + * records must be sparse, the inode ranges must match and there must be no + * allocation overlap between the records. + */ +STATIC bool +__xfs_inobt_can_merge( + struct xfs_inobt_rec_incore *trec, /* tgt record */ + struct xfs_inobt_rec_incore *srec) /* src record */ +{ + uint64_t talloc; + uint64_t salloc; + + /* records must cover the same inode range */ + if (trec->ir_startino != srec->ir_startino) + return false; + + /* both records must be sparse */ + if (!xfs_inobt_issparse(trec->ir_holemask) || + !xfs_inobt_issparse(srec->ir_holemask)) + return false; + + /* both records must track some inodes */ + if (!trec->ir_count || !srec->ir_count) + return false; + + /* can't exceed capacity of a full record */ + if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) + return false; + + /* verify there is no allocation overlap */ + talloc = xfs_inobt_irec_to_allocmask(trec); + salloc = xfs_inobt_irec_to_allocmask(srec); + if (talloc & salloc) + return false; + + return true; +} + +/* + * Merge the source inode record into the target. The caller must call + * __xfs_inobt_can_merge() to ensure the merge is valid. + */ +STATIC void +__xfs_inobt_rec_merge( + struct xfs_inobt_rec_incore *trec, /* target */ + struct xfs_inobt_rec_incore *srec) /* src */ +{ + ASSERT(trec->ir_startino == srec->ir_startino); + + /* combine the counts */ + trec->ir_count += srec->ir_count; + trec->ir_freecount += srec->ir_freecount; + + /* + * Merge the holemask and free mask. For both fields, 0 bits refer to + * allocated inodes. We combine the allocated ranges with bitwise AND. + */ + trec->ir_holemask &= srec->ir_holemask; + trec->ir_free &= srec->ir_free; +} + +/* + * Insert a new sparse inode chunk into the associated inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * This function supports two modes of handling preexisting records depending on + * the merge flag. If merge is true, the provided record is merged with the + * existing record and updated in place. The merged record is returned in nrec. + * If merge is false, an existing record is replaced with the provided record. + * If no preexisting record exists, the provided record is always inserted. + * + * It is considered corruption if a merge is requested and not possible. Given + * the sparse inode alignment constraints, this should never happen. + */ +STATIC int +xfs_inobt_insert_sprec( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + int btnum, + struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ + bool merge) /* merge or replace */ +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + int error; + int i; + struct xfs_inobt_rec_incore rec; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + + goto out; + } + + /* + * A record exists at this startino. Merge or replace the record + * depending on what we've been asked to do. + */ + if (merge) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, + rec.ir_startino == nrec->ir_startino, + error); + + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. + */ + XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), + error); + + trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + rec.ir_holemask, nrec->ir_startino, + nrec->ir_holemask); + + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); + + trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + nrec->ir_holemask); + + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; + } + + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + +out: + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. */ @@ -364,11 +604,22 @@ xfs_ialloc_ag_alloc( xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */ + struct xfs_inobt_rec_incore rec; struct xfs_perag *pag; + int do_sparse = 0; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; + args.fsbno = NULLFSBLOCK; + +#ifdef DEBUG + /* randomly do sparse inode allocations */ + if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks) + do_sparse = prandom_u32() & 1; +#endif /* * Locking will ensure that we don't have two callers in here @@ -390,6 +641,8 @@ xfs_ialloc_ag_alloc( agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + args.mp->m_ialloc_blks; + if (do_sparse) + goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -428,8 +681,7 @@ xfs_ialloc_ag_alloc( * subsequent requests. */ args.minalignslop = 0; - } else - args.fsbno = NULLFSBLOCK; + } if (unlikely(args.fsbno == NULLFSBLOCK)) { /* @@ -480,6 +732,47 @@ xfs_ialloc_ag_alloc( return error; } + /* + * Finally, try a sparse allocation if the filesystem supports it and + * the sparse allocation length is smaller than a full chunk. + */ + if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks && + args.fsbno == NULLFSBLOCK) { +sparse_alloc: + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = args.mp->m_sb.sb_spino_align; + args.prod = 1; + + args.minlen = args.mp->m_ialloc_min_blks; + args.maxlen = args.minlen; + + /* + * The inode record will be aligned to full chunk size. We must + * prevent sparse allocation from AG boundaries that result in + * invalid inode records, such as records that start at agbno 0 + * or extend beyond the AG. + * + * Set min agbno to the first aligned, non-zero agbno and max to + * the last aligned agbno that is at least one full chunk from + * the end of the AG. + */ + args.min_agbno = args.mp->m_sb.sb_inoalignmt; + args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.mp->m_sb.sb_inoalignmt) - + args.mp->m_ialloc_blks; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + newlen = args.len << args.mp->m_sb.sb_inopblog; + ASSERT(newlen <= XFS_INODES_PER_CHUNK); + allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; + } + if (args.fsbno == NULLFSBLOCK) { *alloc = 0; return 0; @@ -495,8 +788,8 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, - args.len, prandom_u32()); + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, + args.agbno, args.len, prandom_u32()); if (error) return error; @@ -504,6 +797,73 @@ xfs_ialloc_ag_alloc( * Convert the results. */ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + + if (xfs_inobt_issparse(~allocmask)) { + /* + * We've allocated a sparse chunk. Align the startino and mask. + */ + xfs_align_sparse_ino(args.mp, &newino, &allocmask); + + rec.ir_startino = newino; + rec.ir_holemask = ~allocmask; + rec.ir_count = newlen; + rec.ir_freecount = newlen; + rec.ir_free = XFS_INOBT_ALL_FREE; + + /* + * Insert the sparse record into the inobt and allow for a merge + * if necessary. If a merge does occur, rec is updated to the + * merged record. + */ + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, + &rec, true); + if (error == -EFSCORRUPTED) { + xfs_alert(args.mp, + "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", + XFS_AGINO_TO_INO(args.mp, agno, + rec.ir_startino), + rec.ir_holemask, rec.ir_count); + xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); + } + if (error) + return error; + + /* + * We can't merge the part we've just allocated as for the inobt + * due to finobt semantics. The original record may or may not + * exist independent of whether physical inodes exist in this + * sparse chunk. + * + * We must update the finobt record based on the inobt record. + * rec contains the fully merged and up to date inobt record + * from the previous call. Set merge false to replace any + * existing record with this one. + */ + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, + XFS_BTNUM_FINO, &rec, + false); + if (error) + return error; + } + } else { + /* full chunk - insert new records to both btrees */ + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, + newlen, XFS_BTNUM_FINO); + if (error) + return error; + } + } + + /* + * Update AGI counts and newino. + */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); pag = xfs_perag_get(args.mp, agno); @@ -512,20 +872,6 @@ xfs_ialloc_ag_alloc( agi->agi_newino = cpu_to_be32(newino); /* - * Insert records describing the new inode chunk into the btrees. - */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_INO); - if (error) - return error; - - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_FINO); - if (error) - return error; - } - /* * Log allocation group header fields */ xfs_ialloc_log_agi(tp, agbp, @@ -645,7 +991,7 @@ xfs_ialloc_ag_select( * if we fail allocation due to alignment issues then it is most * likely a real ENOSPC condition. */ - ineed = mp->m_ialloc_blks; + ineed = mp->m_ialloc_min_blks; if (flags && ineed > 1) ineed += xfs_ialloc_cluster_alignment(mp); longest = pag->pagf_longest; @@ -732,6 +1078,27 @@ xfs_ialloc_get_rec( } /* + * Return the offset of the first free inode in the record. If the inode chunk + * is sparsely allocated, we convert the record holemask to inode granularity + * and mask off the unallocated regions from the inode free mask. + */ +STATIC int +xfs_inobt_first_free_inode( + struct xfs_inobt_rec_incore *rec) +{ + xfs_inofree_t realfree; + + /* if there are no holes, return the first available offset */ + if (!xfs_inobt_issparse(rec->ir_holemask)) + return xfs_lowbit64(rec->ir_free); + + realfree = xfs_inobt_irec_to_allocmask(rec); + realfree &= rec->ir_free; + + return xfs_lowbit64(realfree); +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -961,7 +1328,7 @@ newino: } alloc_inode: - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1210,7 +1577,7 @@ xfs_dialloc_ag( if (error) goto error_cur; - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1439,6 +1806,83 @@ out_error: return error; } +/* + * Free the blocks of an inode chunk. We must consider that the inode chunk + * might be sparse and only free the regions that are allocated as part of the + * chunk. + */ +STATIC void +xfs_difree_inode_chunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *rec, + struct xfs_bmap_free *flist) +{ + xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); + int startidx, endidx; + int nextbit; + xfs_agblock_t agbno; + int contigblk; + DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); + + if (!xfs_inobt_issparse(rec->ir_holemask)) { + /* not sparse, calculate extent info directly */ + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), + mp->m_ialloc_blks, flist, mp); + return; + } + + /* holemask is only 16-bits (fits in an unsigned long) */ + ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0])); + holemask[0] = rec->ir_holemask; + + /* + * Find contiguous ranges of zeroes (i.e., allocated regions) in the + * holemask and convert the start/end index of each range to an extent. + * We start with the start and end index both pointing at the first 0 in + * the mask. + */ + startidx = endidx = find_first_zero_bit(holemask, + XFS_INOBT_HOLEMASK_BITS); + nextbit = startidx + 1; + while (startidx < XFS_INOBT_HOLEMASK_BITS) { + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, + nextbit); + /* + * If the next zero bit is contiguous, update the end index of + * the current range and continue. + */ + if (nextbit != XFS_INOBT_HOLEMASK_BITS && + nextbit == endidx + 1) { + endidx = nextbit; + goto next; + } + + /* + * nextbit is not contiguous with the current end index. Convert + * the current start/end to an extent and add it to the free + * list. + */ + agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + contigblk = ((endidx - startidx + 1) * + XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + + ASSERT(agbno % mp->m_sb.sb_spino_align == 0); + ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, + flist, mp); + + /* reset range to current bit and carry on... */ + startidx = endidx = nextbit; + +next: + nextbit++; + } +} + STATIC int xfs_difree_inobt( struct xfs_mount *mp, @@ -1446,8 +1890,7 @@ xfs_difree_inobt( struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_bmap_free *flist, - int *deleted, - xfs_ino_t *first_ino, + struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); @@ -1501,20 +1944,23 @@ xfs_difree_inobt( rec.ir_freecount++; /* - * When an inode cluster is free, it becomes eligible for removal + * When an inode chunk is free, it becomes eligible for removal. Don't + * remove the chunk if the block size is large enough for multiple inode + * chunks (that might not be free). */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == mp->m_ialloc_inos)) { - - *deleted = 1; - *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + xic->deleted = 1; + xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* * Remove the inode cluster from the AGI B+Tree, adjust the * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = mp->m_ialloc_inos; + ilen = rec.ir_freecount; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1530,11 +1976,9 @@ xfs_difree_inobt( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_difree_inode_chunk(mp, agno, &rec, flist); } else { - *deleted = 0; + xic->deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { @@ -1599,7 +2043,9 @@ xfs_difree_finobt( */ XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); - error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, + ibtrec->ir_count, + ibtrec->ir_freecount, ibtrec->ir_free, &i); if (error) goto error; @@ -1634,8 +2080,13 @@ xfs_difree_finobt( * free inode. Hence, if all of the inodes are free and we aren't * keeping inode chunks permanently on disk, remove the record. * Otherwise, update the record with the new information. + * + * Note that we currently can't free chunks when the block size is large + * enough for multiple chunks. Leave the finobt record to remain in sync + * with the inobt. */ - if (rec.ir_freecount == mp->m_ialloc_inos && + if (rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && !(mp->m_flags & XFS_MOUNT_IKEEP)) { error = xfs_btree_delete(cur, &i); if (error) @@ -1671,8 +2122,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted,/* set if inode cluster was deleted */ - xfs_ino_t *first_ino)/* first inode in deleted cluster */ + struct xfs_icluster *xic) /* cluster info if deleted */ { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ @@ -1723,8 +2173,7 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, - &rec); + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec); if (error) goto error0; @@ -1784,7 +2233,7 @@ xfs_imap_lookup( } xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); if (error) return error; @@ -2052,7 +2501,7 @@ xfs_agi_verify( struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return false; /* * Validate the magic number of the agi block. diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 100007d56449..6e450df2979b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -28,6 +28,13 @@ struct xfs_btree_cur; /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 +struct xfs_icluster { + bool deleted; /* record is deleted */ + xfs_ino_t first_ino; /* first inode number */ + uint64_t alloc; /* inode phys. allocation bitmap for + * sparse chunks */ +}; + /* Calculate and return the number of filesystem blocks per inode cluster */ static inline int xfs_icluster_size_fsb( @@ -44,8 +51,7 @@ xfs_icluster_size_fsb( static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { - return (struct xfs_dinode *) - (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); + return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog); } /* @@ -90,8 +96,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino); /* first inode in deleted cluster */ + struct xfs_icluster *ifree); /* cluster info if deleted */ /* * Return the location of the inode in imap, for mapping it into a buffer. @@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, * Inode chunk initialisation routine */ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, - struct list_head *buffer_list, + struct list_head *buffer_list, int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 964c465ca69c..f39b285beb19 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur( union xfs_btree_rec *rec) { rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec->inobt.ir_u.sp.ir_holemask = + cpu_to_be16(cur->bc_rec.i.ir_holemask); + rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; + rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec->inobt.ir_u.f.ir_freecount = + cpu_to_be32(cur->bc_rec.i.ir_freecount); + } rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); } @@ -230,7 +239,7 @@ xfs_inobt_verify( case cpu_to_be32(XFS_FIBT_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) return false; @@ -418,3 +427,85 @@ xfs_inobt_maxrecs( return blocklen / sizeof(xfs_inobt_rec_t); return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); } + +/* + * Convert the inode record holemask to an inode allocation bitmap. The inode + * allocation bitmap is inode granularity and specifies whether an inode is + * physically allocated on disk (not whether the inode is considered allocated + * or free by the fs). + * + * A bit value of 1 means the inode is allocated, a value of 0 means it is free. + */ +uint64_t +xfs_inobt_irec_to_allocmask( + struct xfs_inobt_rec_incore *rec) +{ + uint64_t bitmap = 0; + uint64_t inodespbit; + int nextbit; + uint allocbitmap; + + /* + * The holemask has 16-bits for a 64 inode record. Therefore each + * holemask bit represents multiple inodes. Create a mask of bits to set + * in the allocmask for each holemask bit. + */ + inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1; + + /* + * Allocated inodes are represented by 0 bits in holemask. Invert the 0 + * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask + * anything beyond the 16 holemask bits since this casts to a larger + * type. + */ + allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1); + + /* + * allocbitmap is the inverted holemask so every set bit represents + * allocated inodes. To expand from 16-bit holemask granularity to + * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target + * bitmap for every holemask bit. + */ + nextbit = xfs_next_bit(&allocbitmap, 1, 0); + while (nextbit != -1) { + ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY)); + + bitmap |= (inodespbit << + (nextbit * XFS_INODES_PER_HOLEMASK_BIT)); + + nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1); + } + + return bitmap; +} + +#if defined(DEBUG) || defined(XFS_WARN) +/* + * Verify that an in-core inode record has a valid inode count. + */ +int +xfs_inobt_rec_check_count( + struct xfs_mount *mp, + struct xfs_inobt_rec_incore *rec) +{ + int inocount = 0; + int nextbit = 0; + uint64_t allocbmap; + int wordsz; + + wordsz = sizeof(allocbmap) / sizeof(unsigned int); + allocbmap = xfs_inobt_irec_to_allocmask(rec); + + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit); + while (nextbit != -1) { + inocount++; + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, + nextbit + 1); + } + + if (inocount != rec->ir_count) + return -EFSCORRUPTED; + + return 0; +} +#endif /* DEBUG */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index d7ebea72c2d0..bd88453217ce 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, xfs_btnum_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +/* ir_holemask to inode allocation bitmap conversion */ +uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); + +#if defined(DEBUG) || defined(XFS_WARN) +int xfs_inobt_rec_check_count(struct xfs_mount *, + struct xfs_inobt_rec_incore *); +#else +#define xfs_inobt_rec_check_count(mp, rec) 0 +#endif /* DEBUG */ + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 002b6b3a1988..268c00f4f83a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -46,8 +46,7 @@ xfs_inobp_check( j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); + dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); if (!dip->di_next_unlinked) { xfs_alert(mp, "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", @@ -86,8 +85,7 @@ xfs_inode_buf_verify( int di_ok; xfs_dinode_t *dip; - dip = (struct xfs_dinode *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); + dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && XFS_DINODE_GOOD_VERSION(dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, @@ -186,7 +184,7 @@ xfs_imap_to_bp( } *bpp = bp; - *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + *dipp = xfs_buf_offset(bp, imap->im_boffset); return 0; } @@ -306,7 +304,7 @@ xfs_dinode_verify( return false; if (be64_to_cpu(dip->di_ino) != ip->i_ino) return false; - if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; return true; } @@ -368,7 +366,7 @@ xfs_iread( if (xfs_sb_version_hascrc(&mp->m_sb)) { ip->i_d.di_version = 3; ip->i_d.di_ino = ip->i_ino; - uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid); } else ip->i_d.di_version = 2; return 0; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index dc4bfc5d88fc..47425140f343 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -131,10 +131,11 @@ xfs_mount_validate_sb( if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, -"Superblock has unknown compatible features (0x%x) enabled.\n" -"Using a more recent kernel is recommended.", +"Superblock has unknown compatible features (0x%x) enabled.", (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); + xfs_warn(mp, +"Using a more recent kernel is recommended."); } if (xfs_sb_has_ro_compat_feature(sbp, @@ -145,18 +146,21 @@ xfs_mount_validate_sb( XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { xfs_warn(mp, -"Attempted to mount read-only compatible filesystem read-write.\n" +"Attempted to mount read-only compatible filesystem read-write."); + xfs_warn(mp, "Filesystem can only be safely mounted read only."); + return -EINVAL; } } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, -"Superblock has unknown incompatible features (0x%x) enabled.\n" -"Filesystem can not be safely mounted by this kernel.", +"Superblock has unknown incompatible features (0x%x) enabled.", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + xfs_warn(mp, +"Filesystem can not be safely mounted by this kernel."); return -EINVAL; } } @@ -174,6 +178,24 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + /* + * Full inode chunks must be aligned to inode chunk size when + * sparse inodes are enabled to support the sparse chunk + * allocation algorithm and prevent overlapping inode records. + */ + if (xfs_sb_version_hassparseinodes(sbp)) { + uint32_t align; + + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize + >> sbp->sb_blocklog; + if (sbp->sb_inoalignmt != align) { + xfs_warn(mp, +"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", + sbp->sb_inoalignmt, align); + return -EINVAL; + } + } + if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -374,9 +396,17 @@ __xfs_sb_from_disk( be32_to_cpu(from->sb_features_log_incompat); /* crc is only used on disk, not in memory; just init to 0 here. */ to->sb_crc = 0; - to->sb_pad = 0; + to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); + /* + * sb_meta_uuid is only on disk if it differs from sb_uuid and the + * feature flag is set; if not set we keep it only in memory. + */ + if (xfs_sb_version_hasmetauuid(to)) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); + else + uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); /* Convert on-disk flags to in-memory flags? */ if (convert_xquota) xfs_sb_quota_from_disk(to); @@ -516,8 +546,10 @@ xfs_sb_to_disk( cpu_to_be32(from->sb_features_incompat); to->sb_features_log_incompat = cpu_to_be32(from->sb_features_log_incompat); - to->sb_pad = 0; + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); + if (xfs_sb_version_hasmetauuid(from)) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); } } @@ -689,6 +721,11 @@ xfs_sb_mount_common( mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; + + if (sbp->sb_spino_align) + mp->m_ialloc_min_blks = sbp->sb_spino_align; + else + mp->m_ialloc_min_blks = mp->m_ialloc_blks; } /* @@ -792,12 +829,12 @@ xfs_sync_sb( tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_log_sb(tp); if (wait) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 8dda4b321343..5be529707903 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer count in superblock */ /* - * Values for call flags parameter. - */ -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* * Field values for xfs_trans_mod_sb. */ #define XFS_TRANS_SB_ICOUNT 0x00000001 diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index e7e26bd6468f..8f8af05b3f13 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -63,7 +63,7 @@ xfs_symlink_hdr_set( dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); dsl->sl_offset = cpu_to_be32(offset); dsl->sl_bytes = cpu_to_be32(size); - uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid); dsl->sl_owner = cpu_to_be64(ino); dsl->sl_blkno = cpu_to_be64(bp->b_bn); bp->b_ops = &xfs_symlink_buf_ops; @@ -107,7 +107,7 @@ xfs_symlink_verify( return false; if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) return false; - if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return false; if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) return false; diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 2d5bdfce6d8f..797815012c0e 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,9 +73,9 @@ struct xfs_trans_resv { * 2 trees * (2 blocks/level * max depth - 1) * block size */ #define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1))) #define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1))) /* * Per-directory log reservation for any directory change. diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index bf9c4579334d..41e0428d8175 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -67,7 +67,7 @@ #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ - (2 * XFS_AG_MAXLEVELS(mp)) + (2 * (mp)->m_ag_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) #define XFS_LINK_SPACE_RES(mp,nl) \ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e5099f268032..50ab2879b9da 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc( * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ - rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); /* * We hand off the transaction to the completion thread now, so * clear the flag here. @@ -145,7 +144,7 @@ xfs_setfilesize( isize = xfs_new_eof(ip, offset + size); if (!isize) { xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return 0; } @@ -155,7 +154,7 @@ xfs_setfilesize( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } STATIC int @@ -171,8 +170,7 @@ xfs_setfilesize_ioend( * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } @@ -351,12 +349,12 @@ xfs_imap_valid( */ STATIC void xfs_end_bio( - struct bio *bio, - int error) + struct bio *bio) { xfs_ioend_t *ioend = bio->bi_private; - ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; + if (!ioend->io_error) + ioend->io_error = bio->bi_error; /* Toss bio and pass work off to an xfsdatad thread */ bio->bi_private = NULL; @@ -382,8 +380,7 @@ STATIC struct bio * xfs_alloc_ioend_bio( struct buffer_head *bh) { - int nvecs = bio_get_nr_vecs(bh->b_bdev); - struct bio *bio = bio_alloc(GFP_NOIO, nvecs); + struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); ASSERT(bio->bi_private == NULL); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); @@ -1348,7 +1345,7 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - int direct) + bool direct) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1413,6 +1410,7 @@ __xfs_get_blocks( if (error) return error; new = 1; + } else { /* * Delalloc reservations do not require a transaction, @@ -1507,49 +1505,29 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); + return __xfs_get_blocks(inode, iblock, bh_result, create, false); } -STATIC int +int xfs_get_blocks_direct( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); + return __xfs_get_blocks(inode, iblock, bh_result, create, true); } -/* - * Complete a direct I/O write request. - * - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite - * wholly within the EOF and so there is nothing for us to do. Note that in this - * case the completion can be called in interrupt context, whereas if we have an - * ioend we will always be called in task context (i.e. from a workqueue). - */ -STATIC void -xfs_end_io_direct_write( - struct kiocb *iocb, +static void +__xfs_end_io_direct_write( + struct inode *inode, + struct xfs_ioend *ioend, loff_t offset, - ssize_t size, - void *private) + ssize_t size) { - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct xfs_ioend *ioend = private; + struct xfs_mount *mp = XFS_I(inode)->i_mount; - trace_xfs_gbmap_direct_endio(ip, offset, size, - ioend ? ioend->io_type : 0, NULL); - - if (!ioend) { - ASSERT(offset + size <= i_size_read(inode)); - return; - } - - if (XFS_FORCED_SHUTDOWN(mp)) + if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) goto out_end_io; /* @@ -1586,10 +1564,10 @@ xfs_end_io_direct_write( * here can result in EOF moving backwards and Bad Things Happen when * that occurs. */ - spin_lock(&ip->i_flags_lock); + spin_lock(&XFS_I(inode)->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); + spin_unlock(&XFS_I(inode)->i_flags_lock); /* * If we are doing an append IO that needs to update the EOF on disk, @@ -1606,6 +1584,98 @@ out_end_io: return; } +/* + * Complete a direct I/O write request. + * + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_ioend *ioend = private; + + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); + return; + } + + __xfs_end_io_direct_write(inode, ioend, offset, size); +} + +/* + * For DAX we need a mapping buffer callback for unwritten extent conversion + * when page faults allocate blocks and then zero them. Note that in this + * case the mapping indicated by the ioend may extend beyond EOF. We most + * definitely do not want to extend EOF here, so we trim back the ioend size to + * EOF. + */ +#ifdef CONFIG_FS_DAX +void +xfs_end_io_dax_write( + struct buffer_head *bh, + int uptodate) +{ + struct xfs_ioend *ioend = bh->b_private; + struct inode *inode = ioend->io_inode; + ssize_t size = ioend->io_size; + + ASSERT(IS_DAX(ioend->io_inode)); + + /* if there was an error zeroing, then don't convert it */ + if (!uptodate) + ioend->io_error = -EIO; + + /* + * Trim update to EOF, so we don't extend EOF during unwritten extent + * conversion of partial EOF blocks. + */ + spin_lock(&XFS_I(inode)->i_flags_lock); + if (ioend->io_offset + size > i_size_read(inode)) + size = i_size_read(inode) - ioend->io_offset; + spin_unlock(&XFS_I(inode)->i_flags_lock); + + __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); + +} +#else +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } +#endif + +static inline ssize_t +xfs_vm_do_dio( + struct inode *inode, + struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset, + void (*endio)(struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private), + int flags) +{ + struct block_device *bdev; + + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, + xfs_get_blocks_direct, endio, 0); + + bdev = xfs_find_bdev_for_inode(inode); + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, endio, NULL, flags); +} + STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1613,16 +1683,11 @@ xfs_vm_direct_IO( loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - if (iov_iter_rw(iter) == WRITE) { - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, - DIO_ASYNC_EXTEND); - } - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, NULL, NULL, 0); + if (iov_iter_rw(iter) == WRITE) + return xfs_vm_do_dio(inode, iocb, iter, offset, + xfs_end_io_direct_write, DIO_ASYNC_EXTEND); + return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ac644e0137a4..86afd1ac7895 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -53,7 +53,12 @@ typedef struct xfs_ioend { } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +int xfs_get_blocks(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +int xfs_get_blocks_direct(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 3fbf167cfb4c..2bb959ada45b 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -394,7 +394,6 @@ xfs_attr_inactive( { struct xfs_trans *trans; struct xfs_mount *mp; - int cancel_flags = 0; int lock_mode = XFS_ILOCK_SHARED; int error = 0; @@ -423,7 +422,6 @@ xfs_attr_inactive( goto out_cancel; lock_mode = XFS_ILOCK_EXCL; - cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; xfs_ilock(dp, lock_mode); if (!XFS_IFORK_Q(dp)) @@ -435,8 +433,14 @@ xfs_attr_inactive( */ xfs_trans_ijoin(trans, dp, 0); - /* invalidate and truncate the attribute fork extents */ - if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + /* + * Invalidate and truncate the attribute fork extents. Make sure the + * fork actually has attributes as otherwise the invalidation has no + * blocks to read and returns an error. In this case, just do the fork + * removal below. + */ + if (xfs_inode_hasattr(dp) && + dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -449,12 +453,12 @@ xfs_attr_inactive( /* Reset the attribute fork - this also destroys the in-core fork */ xfs_attr_fork_remove(dp, trans); - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(trans); xfs_iunlock(dp, lock_mode); return error; out_cancel: - xfs_trans_cancel(trans, cancel_flags); + xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ if (dp->i_afp) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a52bbd3abc7d..3bf4ad0d19e4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -67,78 +67,68 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) */ int /* error */ xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ + struct xfs_trans **tp, /* transaction pointer addr */ + struct xfs_bmap_free *flist, /* i/o: list extents to free */ + int *committed)/* xact committed or not */ { - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - struct xfs_trans_res tres; /* new log reservation */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ + struct xfs_efd_log_item *efd; /* extent free data */ + struct xfs_efi_log_item *efi; /* extent free intention */ + int error; /* error return value */ + struct xfs_bmap_free_item *free; /* free extent item */ + struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) { *committed = 0; return 0; } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - tres.tr_logres = ntp->t_log_res; - tres.tr_logcount = ntp->t_log_count; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) + error = __xfs_trans_roll(tp, NULL, committed); + if (error) { + /* + * If the transaction was committed, drop the EFD reference + * since we're bailing out of here. The other reference is + * dropped when the EFI hits the AIL. + * + * If the transaction was not committed, the EFI is freed by the + * EFI item unlock handler on abort. Also, we have a new + * transaction so we should return committed=1 even though we're + * returning an error. + */ + if (*committed) { + xfs_efi_release(efi); + xfs_force_shutdown((*tp)->t_mountp, + (error == -EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + } else { + *committed = 1; + } + return error; + } /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() + * Get an EFD and free each extent in the list, logging to the EFD in + * the process. The remaining bmap free list is cleaned up by the caller + * on error. */ - xfs_log_ticket_put(ntp->t_ticket); - - error = xfs_trans_reserve(ntp, &tres, 0, 0); - if (error) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = ntp->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); + + error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + if (error) return error; - } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, - free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); } + return 0; } @@ -878,7 +868,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return -EAGAIN; } } @@ -886,7 +876,7 @@ xfs_free_eofblocks( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -908,12 +898,9 @@ xfs_free_eofblocks( * If we get an error at this point we simply don't * bother truncating the file. */ - xfs_trans_cancel(tp, - (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); } else { - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (!error) xfs_inode_clear_eofblocks_tag(ip); } @@ -1026,7 +1013,7 @@ xfs_alloc_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1053,7 +1040,7 @@ xfs_alloc_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { break; @@ -1077,7 +1064,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1133,14 +1120,29 @@ xfs_zero_remaining_bytes( break; ASSERT(imap.br_blockcount >= 1); ASSERT(imap.br_startoff == offset_fsb); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) { + /* skip the entire extent */ + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + + imap.br_blockcount) - 1; + continue; + } + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; if (lastoffset > endoff) lastoffset = endoff; - if (imap.br_startblock == HOLESTARTBLOCK) - continue; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (imap.br_state == XFS_EXT_UNWRITTEN) + + /* DAX can just zero the backing device directly */ + if (IS_DAX(VFS_I(ip))) { + error = dax_zero_page_range(VFS_I(ip), offset, + lastoffset - offset + 1, + xfs_get_blocks_direct); + if (error) + return error; continue; + } error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -1289,7 +1291,7 @@ xfs_free_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1320,7 +1322,7 @@ xfs_free_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -1330,7 +1332,7 @@ xfs_free_file_space( error0: xfs_bmap_cancel(&free_list); error1: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out; } @@ -1462,7 +1464,7 @@ xfs_shift_file_space( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } @@ -1472,7 +1474,7 @@ xfs_shift_file_space( XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -1486,19 +1488,21 @@ xfs_shift_file_space( &done, stop_fsb, &first_block, &free_list, direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) - goto out; + goto out_bmap_cancel; error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto out; + goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); } return error; -out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } @@ -1718,7 +1722,7 @@ xfs_swap_extents( tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_unlock; } @@ -1901,7 +1905,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); @@ -1915,6 +1919,6 @@ out_unlock: goto out; out_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1790b00bea7a..8ecffb35935b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -438,7 +438,6 @@ _xfs_buf_find( xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - size_t numbytes; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent; @@ -450,10 +449,9 @@ _xfs_buf_find( for (i = 0; i < nmaps; i++) numblks += map[i].bm_len; - numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); /* @@ -1096,8 +1094,7 @@ xfs_bwrite( STATIC void xfs_buf_bio_end_io( - struct bio *bio, - int error) + struct bio *bio) { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; @@ -1105,10 +1102,10 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (error) { + if (bio->bi_error) { spin_lock(&bp->b_lock); if (!bp->b_io_error) - bp->b_io_error = error; + bp->b_io_error = bio->bi_error; spin_unlock(&bp->b_lock); } @@ -1419,9 +1416,9 @@ xfs_buf_submit_wait( return error; } -xfs_caddr_t +void * xfs_buf_offset( - xfs_buf_t *bp, + struct xfs_buf *bp, size_t offset) { struct page *page; @@ -1431,7 +1428,7 @@ xfs_buf_offset( offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; - return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); + return page_address(page) + (offset & (PAGE_SIZE-1)); } /* @@ -1533,9 +1530,10 @@ xfs_wait_buftarg( list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { xfs_alert(btp->bt_mount, -"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" -"Please run xfs_repair to determine the extent of the problem.", +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!", (long long)bp->b_bn); + xfs_alert(btp->bt_mount, +"Please run xfs_repair to determine the extent of the problem."); } xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 75ff5d5a7d2e..c79b717d9b88 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -23,6 +23,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/dax.h> #include <linux/buffer_head.h> #include <linux/uio.h> #include <linux/list_lru.h> @@ -299,7 +300,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) /* Buffer Utility Routines */ -extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); +extern void *xfs_buf_offset(struct xfs_buf *, size_t); /* Delayed Write Buffer Routines */ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 092d652bc03d..7e986da34f6c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -647,11 +647,7 @@ xfs_buf_item_unlock( xfs_buf_item_relse(bp); else if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&lip->li_ailp->xa_lock); - xfs_trans_ail_delete(lip->li_ailp, lip, - SHUTDOWN_LOG_IO_ERROR); - } + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); } } @@ -750,13 +746,13 @@ xfs_buf_item_free_format( * buffer (see xfs_buf_attach_iodone() below), then put the * buf log item at the front. */ -void +int xfs_buf_item_init( - xfs_buf_t *bp, - xfs_mount_t *mp) + struct xfs_buf *bp, + struct xfs_mount *mp) { - xfs_log_item_t *lip = bp->b_fspriv; - xfs_buf_log_item_t *bip; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_buf_log_item *bip; int chunks; int map_size; int error; @@ -770,12 +766,11 @@ xfs_buf_item_init( */ ASSERT(bp->b_target->bt_mount == mp); if (lip != NULL && lip->li_type == XFS_LI_BUF) - return; + return 0; bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; - xfs_buf_hold(bp); /* * chunks is the number of XFS_BLF_CHUNK size pieces the buffer @@ -788,6 +783,11 @@ xfs_buf_item_init( */ error = xfs_buf_item_get_format(bip, bp->b_map_count); ASSERT(error == 0); + if (error) { /* to stop gcc throwing set-but-unused warnings */ + kmem_zone_free(xfs_buf_item_zone, bip); + return error; + } + for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), @@ -807,6 +807,8 @@ xfs_buf_item_init( if (bp->b_fspriv) bip->bli_item.li_bio_list = bp->b_fspriv; bp->b_fspriv = bip; + xfs_buf_hold(bp); + return 0; } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 3f3455a41510..f7eba99d19dd 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -61,7 +61,7 @@ typedef struct xfs_buf_log_item { struct xfs_buf_log_format __bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; -void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); uint xfs_buf_item_dirty(xfs_buf_log_item_t *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 098cd78fe708..a989a9c7edb7 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -171,6 +171,7 @@ xfs_dir2_block_getdents( int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; + int lock_mode; /* * If the block number in the offset is out of range, we're done. @@ -178,7 +179,9 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir3_block_read(NULL, dp, &bp); + xfs_iunlock(dp, lock_mode); if (error) return error; @@ -529,9 +532,12 @@ xfs_dir2_leaf_getdents( * current buffer, need to get another one. */ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + int lock_mode; + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, &curoff, &bp); + xfs_iunlock(dp, lock_mode); if (error || !map_info->map_valid) break; @@ -653,7 +659,6 @@ xfs_readdir( struct xfs_da_args args = { NULL }; int rval; int v; - uint lock_mode; trace_xfs_readdir(dp); @@ -666,7 +671,7 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - lock_mode = xfs_ilock_data_map_shared(dp); + xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -675,7 +680,7 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, lock_mode); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 02c01bbbc789..30cb3afb67f0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -251,7 +251,7 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -568,8 +568,6 @@ xfs_qm_dqread( struct xfs_buf *bp; struct xfs_trans *tp = NULL; int error; - int cancelflags = 0; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); @@ -617,7 +615,6 @@ xfs_qm_dqread( XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; } /* @@ -632,7 +629,6 @@ xfs_qm_dqread( * allocate (ENOENT). */ trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; goto error1; } @@ -670,7 +666,7 @@ xfs_qm_dqread( xfs_trans_brelse(tp, bp); if (tp) { - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; } @@ -680,7 +676,7 @@ xfs_qm_dqread( error1: if (tp) - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; @@ -958,12 +954,8 @@ xfs_qm_dqflush( struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; - spin_lock(&mp->m_ail->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(mp->m_ail, lip, - SHUTDOWN_CORRUPT_INCORE); - else - spin_unlock(&mp->m_ail->xa_lock); + xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE); + error = -EIO; goto out_unlock; } diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 338e50bbfd1e..74d0e5966ebc 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -127,7 +127,7 @@ xfs_error_report( struct xfs_mount *mp, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, @@ -146,7 +146,7 @@ xfs_corruption_error( void *p, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 64); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c0394ed126fc..4ed3042a0f16 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -21,10 +21,10 @@ struct xfs_mount; extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, inst_t *ra); + const char *filename, int linenum, void *ra); extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, - int linenum, inst_t *ra); + int linenum, void *ra); extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index cb7fe64cdbfa..4aa0153214f9 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -47,28 +47,6 @@ xfs_efi_item_free( } /* - * Freeing the efi requires that we remove it from the AIL if it has already - * been placed there. However, the EFI may not yet have been placed in the AIL - * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the EFI. - */ -STATIC void -__xfs_efi_release( - struct xfs_efi_log_item *efip) -{ - struct xfs_ail *ailp = efip->efi_item.li_ailp; - - if (atomic_dec_and_test(&efip->efi_refcount)) { - spin_lock(&ailp->xa_lock); - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item, - SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } -} - -/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. @@ -128,12 +106,12 @@ xfs_efi_item_pin( } /* - * While EFIs cannot really be pinned, the unpin operation is the last place at - * which the EFI is manipulated during a transaction. If we are being asked to - * remove the EFI it's because the transaction has been cancelled and by - * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. Otherwise coordinate with xfs_efi_release() - * to determine who gets to free the EFI. + * The unpin operation is the last place an EFI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the EFI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the EFI to either construct + * and commit the EFD or drop the EFD's reference in the event of error. Simply + * drop the log's EFI reference now that the log is done with it. */ STATIC void xfs_efi_item_unpin( @@ -141,15 +119,7 @@ xfs_efi_item_unpin( int remove) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - if (remove) { - ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); - if (lip->li_desc) - xfs_trans_del_item(lip); - xfs_efi_item_free(efip); - return; - } - __xfs_efi_release(efip); + xfs_efi_release(efip); } /* @@ -167,6 +137,11 @@ xfs_efi_item_push( return XFS_ITEM_PINNED; } +/* + * The EFI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an EFD isn't going to be + * constructed and thus we free the EFI here directly. + */ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) @@ -239,7 +214,7 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; - efip->efi_format.efi_id = (__psint_t)(void*)efip; + efip->efi_format.efi_id = (uintptr_t)(void *)efip; atomic_set(&efip->efi_next_extent, 0); atomic_set(&efip->efi_refcount, 2); @@ -301,23 +276,19 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to the given - * efi item. Each efd calls this with the number of extents that it has - * logged, and when the sum of these reaches the total number of extents logged - * by this efi item we can free the efi item. + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. */ void -xfs_efi_release(xfs_efi_log_item_t *efip, - uint nextents) +xfs_efi_release( + struct xfs_efi_log_item *efip) { - ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { - /* recovery needs us to drop the EFI reference, too */ - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - __xfs_efi_release(efip); - - __xfs_efi_release(efip); - /* efip may now have been freed, do not reference it again. */ + if (atomic_dec_and_test(&efip->efi_refcount)) { + xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); + xfs_efi_item_free(efip); } } @@ -415,20 +386,27 @@ xfs_efd_item_push( return XFS_ITEM_PINNED; } +/* + * The EFD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the EFI and free the EFD. + */ STATIC void xfs_efd_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) - xfs_efd_item_free(EFD_ITEM(lip)); + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_efi_release(efdp->efd_efip); + xfs_efd_item_free(efdp); + } } /* - * When the efd item is committed to disk, all we need to do - * is delete our reference to our partner efi item and then - * free ourselves. Since we're freeing ourselves we must - * return -1 to keep the transaction code from further referencing - * this item. + * When the efd item is committed to disk, all we need to do is delete our + * reference to our partner efi item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from further + * referencing this item. */ STATIC xfs_lsn_t xfs_efd_item_committed( @@ -438,13 +416,14 @@ xfs_efd_item_committed( struct xfs_efd_log_item *efdp = EFD_ITEM(lip); /* - * If we got a log I/O error, it's always the case that the LR with the - * EFI got unpinned and freed before the EFD got aborted. + * Drop the EFI reference regardless of whether the EFD has been + * aborted. Once the EFD transaction is constructed, it is the sole + * responsibility of the EFD to release the EFI (even if the EFI is + * aborted due to log I/O error). */ - if (!(lip->li_flags & XFS_LI_ABORTED)) - xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); - + xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); + return (xfs_lsn_t)-1; } diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0ffbce32d569..8fa8651705e1 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -39,9 +39,28 @@ struct kmem_zone; * "extent free done" log item described below. * * The EFI is reference counted so that it is not freed prior to both the EFI - * and EFD being committed and unpinned. This ensures that when the last - * reference goes away the EFI will always be in the AIL as it has been - * unpinned, regardless of whether the EFD is processed before or after the EFI. + * and EFD being committed and unpinned. This ensures the EFI is inserted into + * the AIL even in the event of out of order EFI/EFD processing. In other words, + * an EFI is born with two references: + * + * 1.) an EFI held reference to track EFI AIL insertion + * 2.) an EFD held reference to track EFD commit + * + * On allocation, both references are the responsibility of the caller. Once the + * EFI is added to and dirtied in a transaction, ownership of reference one + * transfers to the transaction. The reference is dropped once the EFI is + * inserted to the AIL or in the event of failure along the way (e.g., commit + * failure, log I/O error, etc.). Note that the caller remains responsible for + * the EFD reference under all circumstances to this point. The caller has no + * means to detect failure once the transaction is committed, however. + * Therefore, an EFD is required after this point, even in the event of + * unrelated failure. + * + * Once an EFD is allocated and dirtied in a transaction, reference two + * transfers to the transaction. The EFD reference is dropped once it reaches + * the unpin handler. Similar to the EFI, the reference also drops in the event + * of commit failure or log I/O errors. Note that the EFD is not inserted in the + * AIL, so at this point both the EFI and EFD are freed. */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; @@ -77,5 +96,6 @@ xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); void xfs_efi_item_free(xfs_efi_log_item_t *); +void xfs_efi_release(struct xfs_efi_log_item *); #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7c62fca53e2f..e78feb400e22 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,14 +80,15 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero + * xfs_iozero clears the specified range supplied via the page cache (except in + * the DAX case). Writes through the page cache will allocate blocks over holes, + * though the callers usually map the holes first and avoid them. If a block is + * not completely zeroed, then it will be read from disk before being partially + * zeroed. * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. + * In the DAX case, we can just directly write to the underlying pages. This + * will not allocate blocks, but will avoid holes and unwritten extents and so + * not do unnecessary work. */ int xfs_iozero( @@ -97,7 +98,8 @@ xfs_iozero( { struct page *page; struct address_space *mapping; - int status; + int status = 0; + mapping = VFS_I(ip)->i_mapping; do { @@ -109,20 +111,27 @@ xfs_iozero( if (bytes > count) bytes = count; - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; + if (IS_DAX(VFS_I(ip))) { + status = dax_zero_page_range(VFS_I(ip), pos, bytes, + xfs_get_blocks_direct); + if (status) + break; + } else { + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; - zero_user(page, offset, bytes); + zero_user(page, offset, bytes); - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ + status = pagecache_write_end(NULL, mapping, pos, bytes, + bytes, page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + status = 0; + } pos += bytes; count -= bytes; - status = 0; } while (count); return status; @@ -139,7 +148,7 @@ xfs_update_prealloc_flags( tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -161,7 +170,7 @@ xfs_update_prealloc_flags( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (flags & XFS_PREALLOC_SYNC) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } /* @@ -285,7 +294,7 @@ xfs_file_read_iter( if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -308,24 +317,33 @@ xfs_file_read_iter( return -EIO; /* - * Locking is a bit tricky here. If we take an exclusive lock - * for direct IO, we effectively serialise all new concurrent - * read IO to this file and block it behind IO that is currently in - * progress because IO in progress holds the IO lock shared. We only - * need to hold the lock exclusive to blow away the page cache, so - * only take lock exclusively if the page cache needs invalidation. - * This allows the normal direct IO case of no page cache pages to - * proceeed concurrently without serialisation. + * Locking is a bit tricky here. If we take an exclusive lock for direct + * IO, we effectively serialise all new concurrent read IO to this file + * and block it behind IO that is currently in progress because IO in + * progress holds the IO lock shared. We only need to hold the lock + * exclusive to blow away the page cache, so only take lock exclusively + * if the page cache needs invalidation. This allows the normal direct + * IO case of no page cache pages to proceeed concurrently without + * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + /* + * The generic dio code only flushes the range of the particular + * I/O. Because we take an exclusive lock here, this whole + * sequence is considerably more expensive for us. This has a + * noticeable performance impact for any file with cached pages, + * even when outside of the range of the particular I/O. + * + * Hence, amortize the cost of the lock against a full file + * flush and reduce the chances of repeated iolock cycles going + * forward. + */ if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait_range( - VFS_I(ip)->i_mapping, - pos, pos + size - 1); + ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -336,9 +354,7 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + size - 1) >> PAGE_CACHE_SHIFT); + ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } @@ -379,7 +395,11 @@ xfs_file_splice_read( trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + /* for dax, we need to avoid the page cache */ + if (IS_DAX(VFS_I(ip))) + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); + else + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -564,6 +584,13 @@ restart: if (error) return error; + /* For changing security info in file_remove_privs() we need i_mutex */ + if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + goto restart; + } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -624,7 +651,9 @@ restart: * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ - return file_remove_suid(file); + if (!IS_NOSEC(inode)) + return file_remove_privs(file); + return 0; } /* @@ -673,7 +702,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if ((pos | count) & target->bt_logical_sectormask) + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -711,19 +740,19 @@ xfs_file_dio_aio_write( pos = iocb->ki_pos; end = pos + count - 1; + /* + * See xfs_file_read_iter() for why we do a full-file flush here. + */ if (mapping->nrpages) { - ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, end); + ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (ret) goto out; /* - * Invalidate whole pages. This can return an error if - * we fail to invalidate a page, but this should never - * happen on XFS. Warn if it does fail. + * Invalidate whole pages. This can return an error if we fail + * to invalidate a page, but this should never happen on XFS. + * Warn if it does fail. */ - ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - pos >> PAGE_CACHE_SHIFT, - end >> PAGE_CACHE_SHIFT); + ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); WARN_ON_ONCE(ret); ret = 0; } @@ -759,8 +788,11 @@ xfs_file_dio_aio_write( out: xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ - ASSERT(ret < 0 || ret == count); + /* + * No fallback to buffered IO on errors for XFS. DAX can result in + * partial writes, but direct IO will either complete fully or fail. + */ + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); return ret; } @@ -843,7 +875,7 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1064,17 +1096,6 @@ xfs_file_readdir( return xfs_readdir(ip, ctx, bufsize); } -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - - file_accessed(filp); - return 0; -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1455,48 +1476,120 @@ xfs_file_llseek( * ordering of: * * mmap_sem (MM) - * i_mmap_lock (XFS - truncate serialisation) - * page_lock (MM) - * i_lock (XFS - extent map serialisation) + * sb_start_pagefault(vfs, freeze) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. */ STATIC int -xfs_filemap_fault( +xfs_filemap_page_mkwrite( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + struct inode *inode = file_inode(vma->vm_file); + int ret; - trace_xfs_filemap_fault(ip); + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = filemap_fault(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - return error; + if (IS_DAX(inode)) { + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, + xfs_end_io_dax_write); + } else { + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = block_page_mkwrite_return(ret); + } + + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; } -/* - * mmap()d file has taken write protection fault and is being made writable. We - * can set the page state up correctly for a writable page, which means we can - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent - * mapping. - */ STATIC int -xfs_filemap_page_mkwrite( +xfs_filemap_fault( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + struct inode *inode = file_inode(vma->vm_file); + int ret; - trace_xfs_filemap_page_mkwrite(ip); + trace_xfs_filemap_fault(XFS_I(inode)); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = block_page_mkwrite(vma, vmf, xfs_get_blocks); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + /* DAX can shortcut the normal fault path on write faults! */ + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) + return xfs_filemap_page_mkwrite(vma, vmf); - return error; + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (IS_DAX(inode)) { + /* + * we do not want to trigger unwritten extent conversion on read + * faults - that is unnecessary overhead and would also require + * changes to xfs_get_blocks_direct() to map unwritten extent + * ioend for conversion on read-only mappings. + */ + ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL); + } else + ret = filemap_fault(vma, vmf); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + + return ret; +} + +STATIC int +xfs_filemap_pmd_fault( + struct vm_area_struct *vma, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct inode *inode = file_inode(vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); + int ret; + + if (!IS_DAX(inode)) + return VM_FAULT_FALLBACK; + + trace_xfs_filemap_pmd_fault(ip); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct, + xfs_end_io_dax_write); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; +} + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .pmd_fault = xfs_filemap_pmd_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + file_accessed(filp); + vma->vm_ops = &xfs_file_vm_ops; + if (IS_DAX(file_inode(filp))) + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; } const struct file_operations xfs_file_operations = { @@ -1527,9 +1620,3 @@ const struct file_operations xfs_dir_file_operations = { #endif .fsync = xfs_dir_fsync, }; - -static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = xfs_filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = xfs_filemap_page_mkwrite, -}; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index da82f1cb4b9b..c4c130f9bfb6 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -196,7 +196,8 @@ xfs_filestream_pick_ag( goto next_ag; } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cb7e8a29dfb6..ee3aaa0a5317 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -101,7 +101,9 @@ xfs_fs_geometry( (xfs_sb_version_hasftype(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | (xfs_sb_version_hasfinobt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FINOBT : 0); + XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | + (xfs_sb_version_hassparseinodes(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SPINODES : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -201,7 +203,7 @@ xfs_growfs_data_private( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -248,7 +250,7 @@ xfs_growfs_data_private( agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -271,7 +273,7 @@ xfs_growfs_data_private( if (xfs_sb_version_hascrc(&mp->m_sb)) { agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); agfl->agfl_seqno = cpu_to_be32(agno); - uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); @@ -307,7 +309,7 @@ xfs_growfs_data_private( agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); if (xfs_sb_version_hasfinobt(&mp->m_sb)) { agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); agi->agi_free_level = cpu_to_be32(1); @@ -489,7 +491,7 @@ xfs_growfs_data_private( if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) return error; @@ -557,7 +559,7 @@ xfs_growfs_data_private( return saved_error ? saved_error : error; error0: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 76a9f2783282..0a326bd64d4e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -412,6 +412,8 @@ xfs_iget( if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return -EINVAL; + XFS_STATS_INC(xs_ig_attempts); + /* get the perag structure and ensure that it's inode capable */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 539a85fddbc2..dc40a6d5ae0d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -164,7 +164,7 @@ xfs_ilock( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); @@ -212,7 +212,7 @@ xfs_ilock_nowait( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { if (!mrtryupdate(&ip->i_iolock)) @@ -281,7 +281,7 @@ xfs_iunlock( (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) @@ -363,31 +363,57 @@ int xfs_lock_delays; #endif /* + * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when + * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined + * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build + * errors and warnings. + */ +#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP) +static bool +xfs_lockdep_subclass_ok( + int subclass) +{ + return subclass < MAX_LOCKDEP_SUBCLASSES; +} +#else +#define xfs_lockdep_subclass_ok(subclass) (true) +#endif + +/* * Bump the subclass so xfs_lock_inodes() acquires each lock with a different - * value. This shouldn't be called for page fault locking, but we also need to - * ensure we don't overrun the number of lockdep subclasses for the iolock or - * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. + * value. This can be called for any type of inode lock combination, including + * parent locking. Care must be taken to ensure we don't overrun the subclass + * storage fields in the class mask we build. */ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { + int class = 0; + + ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | + XFS_ILOCK_RTSUM))); + ASSERT(xfs_lockdep_subclass_ok(subclass)); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(subclass + XFS_LOCK_INUMORDER < - (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); + ASSERT(xfs_lockdep_subclass_ok(subclass + + XFS_IOLOCK_PARENT_VAL)); + class += subclass << XFS_IOLOCK_SHIFT; + if (lock_mode & XFS_IOLOCK_PARENT) + class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { - ASSERT(subclass + XFS_LOCK_INUMORDER < - (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << - XFS_MMAPLOCK_SHIFT; + ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS); + class += subclass << XFS_MMAPLOCK_SHIFT; } - if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) - lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) { + ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS); + class += subclass << XFS_ILOCK_SHIFT; + } - return lock_mode; + return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class; } /* @@ -399,6 +425,11 @@ xfs_lock_inumorder(int lock_mode, int subclass) * transaction (such as truncate). This can result in deadlock since the long * running trans might need to wait for the inode we just locked in order to * push the tail and free space in the log. + * + * xfs_lock_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. */ void xfs_lock_inodes( @@ -409,8 +440,29 @@ xfs_lock_inodes( int attempts = 0, i, j, try_lock; xfs_log_item_t *lp; - /* currently supports between 2 and 5 inodes */ + /* + * Currently supports between 2 and 5 inodes with exclusive locking. We + * support an arbitrary depth of locking here, but absolute limits on + * inodes depend on the the type of locking and the limits placed by + * lockdep annotations in xfs_lock_inumorder. These are all checked by + * the asserts. + */ ASSERT(ips && inodes >= 2 && inodes <= 5); + ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | + XFS_ILOCK_EXCL)); + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | + XFS_ILOCK_SHARED))); + ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || + inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); + ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || + inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); + ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || + inodes <= XFS_ILOCK_MAX_SUBCLASS + 1); + + if (lock_mode & XFS_IOLOCK_EXCL) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL))); + } else if (lock_mode & XFS_MMAPLOCK_EXCL) + ASSERT(!(lock_mode & XFS_ILOCK_EXCL)); try_lock = 0; i = 0; @@ -629,30 +681,29 @@ xfs_lookup( { xfs_ino_t inum; int error; - uint lock_mode; trace_xfs_lookup(dp, name); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - lock_mode = xfs_ilock_data_map_shared(dp); + xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); - xfs_iunlock(dp, lock_mode); - if (error) - goto out; + goto out_unlock; error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); if (error) goto out_free_name; + xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); -out: +out_unlock: + xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -787,7 +838,7 @@ xfs_ialloc( if (ip->i_d.di_version == 3) { ASSERT(ip->i_d.di_ino == ino); - ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid)); ip->i_d.di_crc = 0; ip->i_d.di_changecount = 1; ip->i_d.di_lsn = 0; @@ -905,7 +956,6 @@ xfs_dir_ialloc( { xfs_trans_t *tp; - xfs_trans_t *ntp; xfs_inode_t *ip; xfs_buf_t *ialloc_context = NULL; int code; @@ -954,8 +1004,6 @@ xfs_dir_ialloc( * to succeed the second time. */ if (ialloc_context) { - struct xfs_trans_res tres; - /* * Normally, xfs_trans_commit releases all the locks. * We call bhold to hang on to the ialloc_context across @@ -964,12 +1012,6 @@ xfs_dir_ialloc( * allocation group. */ xfs_trans_bhold(tp, ialloc_context); - /* - * Save the log reservation so we can use - * them in the next transaction. - */ - tres.tr_logres = xfs_trans_get_log_res(tp); - tres.tr_logcount = xfs_trans_get_log_count(tp); /* * We want the quota changes to be associated with the next @@ -985,35 +1027,9 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - ntp = xfs_trans_dup(tp); - code = xfs_trans_commit(tp, 0); - tp = ntp; - if (committed != NULL) { + code = xfs_trans_roll(&tp, 0); + if (committed != NULL) *committed = 1; - } - /* - * If we get an error during the commit processing, - * release the buffer that is still held and return - * to the caller. - */ - if (code) { - xfs_buf_relse(ialloc_context); - if (dqinfo) { - tp->t_dqinfo = dqinfo; - xfs_trans_free_dqinfo(tp); - } - *tpp = ntp; - *ipp = NULL; - return code; - } - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - code = xfs_trans_reserve(tp, &tres, 0, 0); /* * Re-attach the quota info that we detached from prev trx. @@ -1025,7 +1041,7 @@ xfs_dir_ialloc( if (code) { xfs_buf_relse(ialloc_context); - *tpp = ntp; + *tpp = tp; *ipp = NULL; return code; } @@ -1127,7 +1143,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; prid_t prid; struct xfs_dquot *udqp = NULL; @@ -1164,8 +1179,6 @@ xfs_create( tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* * Initially assume that the file does not exist and * reserve the resources for that case. If that is not @@ -1183,12 +1196,12 @@ xfs_create( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_bmap_init(&free_list, &first_block); @@ -1214,11 +1227,8 @@ xfs_create( */ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, resblks > 0, &ip, &committed); - if (error) { - if (error == -ENOSPC) - goto out_trans_cancel; - goto out_trans_abort; - } + if (error) + goto out_trans_cancel; /* * Now we join the directory inode to the transaction. We do not do it @@ -1227,7 +1237,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1235,7 +1245,7 @@ xfs_create( resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != -ENOSPC); - goto out_trans_abort; + goto out_trans_cancel; } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -1269,7 +1279,7 @@ xfs_create( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1282,10 +1292,8 @@ xfs_create( out_bmap_cancel: xfs_bmap_cancel(&free_list); - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -1302,7 +1310,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return error; } @@ -1317,7 +1325,6 @@ xfs_create_tmpfile( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1350,10 +1357,8 @@ xfs_create_tmpfile( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1362,11 +1367,8 @@ xfs_create_tmpfile( error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == -ENOSPC) - goto out_trans_cancel; - goto out_trans_abort; - } + if (error) + goto out_trans_cancel; if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); @@ -1381,9 +1383,9 @@ xfs_create_tmpfile( ip->i_d.di_nlink--; error = xfs_iunlink(tp, ip); if (error) - goto out_trans_abort; + goto out_trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1394,10 +1396,8 @@ xfs_create_tmpfile( *ipp = ip; return 0; - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -1427,7 +1427,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; int resblks; @@ -1447,22 +1446,20 @@ xfs_link( goto std_return; tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); if (error == -ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto error_return; - } + xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -1486,19 +1483,19 @@ xfs_link( if (sip->i_d.di_nlink == 0) { error = xfs_iunlink_remove(tp, sip); if (error) - goto abort_return; + goto error_return; } error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); if (error) - goto abort_return; + goto error_return; xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); error = xfs_bumplink(tp, sip); if (error) - goto abort_return; + goto error_return; /* * If this is a synchronous mount, make sure that the @@ -1512,15 +1509,13 @@ xfs_link( error = xfs_bmap_finish (&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); - goto abort_return; + goto error_return; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; error_return: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -1555,7 +1550,6 @@ xfs_itruncate_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; - struct xfs_trans *ntp; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; xfs_fileoff_t first_unmap_block; @@ -1613,29 +1607,7 @@ xfs_itruncate_extents( if (error) goto out_bmap_cancel; - if (committed) { - /* - * Mark the inode dirty so it will be logged and - * moved forward in the log as part of every commit. - */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } - - ntp = xfs_trans_dup(tp); - error = xfs_trans_commit(tp, 0); - tp = ntp; - - xfs_trans_ijoin(tp, ip, 0); - - if (error) - goto out; - - /* - * Transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_roll(&tp, ip); if (error) goto out; } @@ -1756,7 +1728,7 @@ xfs_inactive_truncate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1777,7 +1749,7 @@ xfs_inactive_truncate( ASSERT(ip->i_d.di_nextents == 0); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error_unlock; @@ -1785,7 +1757,7 @@ xfs_inactive_truncate( return 0; error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -1835,7 +1807,7 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(tp); return error; } @@ -1855,7 +1827,7 @@ xfs_inactive_ifree( __func__, error); xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1866,15 +1838,16 @@ xfs_inactive_ifree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); /* - * Just ignore errors at this point. There is nothing we can - * do except to try to keep going. Make sure it's not a silent - * error. + * Just ignore errors at this point. There is nothing we can do except + * to try to keep going. Make sure it's not a silent error. */ error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) + if (error) { xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_bmap_cancel(&free_list); + } + error = xfs_trans_commit(tp); if (error) xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); @@ -2235,28 +2208,42 @@ xfs_iunlink_remove( */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, - xfs_ino_t inum) + xfs_inode_t *free_ip, + xfs_trans_t *tp, + struct xfs_icluster *xic) { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; int inodes_per_cluster; int nbufs; int i, j; + int ioffset; xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; struct xfs_perag *pag; + xfs_ino_t inum; + inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); blks_per_cluster = xfs_icluster_size_fsb(mp); inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; nbufs = mp->m_ialloc_blks / blks_per_cluster; for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + /* + * The allocation bitmap tells us which inodes of the chunk were + * physically allocated. Skip the cluster if an inode falls into + * a sparse region. + */ + ioffset = inum - xic->first_ino; + if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { + ASSERT(do_mod(ioffset, inodes_per_cluster) == 0); + continue; + } + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2414,8 +2401,7 @@ xfs_ifree( xfs_bmap_free_t *flist) { int error; - int delete; - xfs_ino_t first_ino; + struct xfs_icluster xic = { 0 }; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -2431,7 +2417,7 @@ xfs_ifree( if (error) return error; - error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + error = xfs_difree(tp, ip->i_ino, flist, &xic); if (error) return error; @@ -2448,8 +2434,8 @@ xfs_ifree( ip->i_d.di_gen++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (delete) - error = xfs_ifree_cluster(ip, tp, first_ino); + if (xic.deleted) + error = xfs_ifree_cluster(ip, tp, &xic); return error; } @@ -2536,7 +2522,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; uint resblks; @@ -2557,7 +2542,6 @@ xfs_remove( tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); else tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * We try to get the real space reservation first, @@ -2576,19 +2560,18 @@ xfs_remove( } if (error) { ASSERT(error != -ENOSPC); - cancel_flags = 0; goto out_trans_cancel; } + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. */ - cancel_flags |= XFS_TRANS_ABORT; if (is_dir) { ASSERT(ip->i_d.di_nlink >= 2); if (ip->i_d.di_nlink != 2) { @@ -2644,7 +2627,7 @@ xfs_remove( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto std_return; @@ -2656,7 +2639,7 @@ xfs_remove( out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -2730,11 +2713,11 @@ xfs_finish_rename( error = xfs_bmap_finish(&tp, free_list, &committed); if (error) { xfs_bmap_cancel(free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); } /* @@ -2855,7 +2838,7 @@ xfs_cross_rename( out_trans_abort: xfs_bmap_cancel(free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -2915,7 +2898,6 @@ xfs_rename( int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - int cancel_flags = 0; int spaceres; int error; @@ -2951,7 +2933,6 @@ xfs_rename( } if (error) goto out_trans_cancel; - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * Attach the dquots to the inodes @@ -2966,6 +2947,12 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. */ + if (!new_parent) + xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + else + xfs_lock_two_inodes(src_dp, target_dp, + XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2973,9 +2960,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); @@ -3022,10 +3009,8 @@ xfs_rename( error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, &first_block, &free_list, spaceres); - if (error == -ENOSPC) - goto out_bmap_cancel; if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3033,7 +3018,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } } else { /* target_ip != NULL */ /* @@ -3065,7 +3050,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3076,7 +3061,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; if (src_is_directory) { /* @@ -3084,7 +3069,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } } /* target_ip != NULL */ @@ -3101,7 +3086,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } /* @@ -3127,7 +3112,7 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } /* @@ -3142,7 +3127,7 @@ xfs_rename( error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto out_trans_abort; + goto out_bmap_cancel; /* * For whiteouts, we need to bump the link count on the whiteout inode. @@ -3156,10 +3141,10 @@ xfs_rename( ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); error = xfs_bumplink(tp, wip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; error = xfs_iunlink_remove(tp, wip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); /* @@ -3180,12 +3165,10 @@ xfs_rename( IRELE(wip); return error; -out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); if (wip) IRELE(wip); return error; @@ -3464,7 +3447,7 @@ xfs_iflush_int( ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8f22d20368d8..ca9e11989cbd 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -284,9 +284,9 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * Flags for lockdep annotations. * * XFS_LOCK_PARENT - for directory operations that require locking a - * parent directory inode and a child entry inode. The parent gets locked - * with this flag so it gets a lockdep subclass of 1 and the child entry - * lock will have a lockdep subclass of 0. + * parent directory inode and a child entry inode. IOLOCK requires nesting, + * MMAPLOCK does not support this class, ILOCK requires a single subclass + * to differentiate parent from child. * * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary * inodes do not participate in the normal lock order, and thus have their @@ -295,30 +295,63 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * XFS_LOCK_INUMORDER - for locking several inodes at the some time * with xfs_lock_inodes(). This flag is used as the starting subclass * and each subsequent lock acquired will increment the subclass by one. - * So the first lock acquired will have a lockdep subclass of 4, the - * second lock will have a lockdep subclass of 5, and so on. It is - * the responsibility of the class builder to shift this to the correct - * portion of the lock_mode lockdep mask. + * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly + * limited to the subclasses we can represent via nesting. We need at least + * 5 inodes nest depth for the ILOCK through rename, and we also have to support + * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP + * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all + * 8 subclasses supported by lockdep. + * + * This also means we have to number the sub-classes in the lowest bits of + * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep + * mask and we can't use bit-masking to build the subclasses. What a mess. + * + * Bit layout: + * + * Bit Lock Region + * 16-19 XFS_IOLOCK_SHIFT dependencies + * 20-23 XFS_MMAPLOCK_SHIFT dependencies + * 24-31 XFS_ILOCK_SHIFT dependencies + * + * IOLOCK values + * + * 0-3 subclass value + * 4-7 PARENT subclass values + * + * MMAPLOCK values + * + * 0-3 subclass value + * 4-7 unused + * + * ILOCK values + * 0-4 subclass values + * 5 PARENT subclass (not nestable) + * 6 RTBITMAP subclass (not nestable) + * 7 RTSUM subclass (not nestable) + * */ -#define XFS_LOCK_PARENT 1 -#define XFS_LOCK_RTBITMAP 2 -#define XFS_LOCK_RTSUM 3 -#define XFS_LOCK_INUMORDER 4 - -#define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) - -#define XFS_MMAPLOCK_SHIFT 20 - -#define XFS_ILOCK_SHIFT 24 -#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) - -#define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 -#define XFS_ILOCK_DEP_MASK 0xff000000 -#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT_VAL 4 +#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) + +#define XFS_MMAPLOCK_SHIFT 20 +#define XFS_MMAPLOCK_NUMORDER 0 +#define XFS_MMAPLOCK_MAX_SUBCLASS 3 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT_VAL 5 +#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) +#define XFS_ILOCK_RTBITMAP_VAL 6 +#define XFS_ILOCK_RTSUM_VAL 7 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) + +#define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \ XFS_MMAPLOCK_DEP_MASK | \ XFS_ILOCK_DEP_MASK) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index bf13a5a7e2f4..62bd80f4edd9 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -703,17 +703,10 @@ xfs_iflush_abort( xfs_inode_log_item_t *iip = ip->i_itemp; if (iip) { - struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &iip->ili_item, - stale ? - SHUTDOWN_LOG_IO_ERROR : + xfs_trans_ail_remove(&iip->ili_item, + stale ? SHUTDOWN_LOG_IO_ERROR : SHUTDOWN_CORRUPT_INCORE); - } else - spin_unlock(&ailp->xa_lock); } iip->ili_logged = 0; /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 87f67c6b654c..ea7d85af5310 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -336,7 +336,7 @@ xfs_set_dmattrs( tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -346,7 +346,7 @@ xfs_set_dmattrs( ip->i_d.di_dmstate = state; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; } @@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans( return tp; out_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return ERR_PTR(error); } @@ -1253,7 +1253,7 @@ xfs_ioctl_setattr( else ip->i_d.di_extsize = 0; - code = xfs_trans_commit(tp, 0); + code = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. @@ -1265,7 +1265,7 @@ xfs_ioctl_setattr( return code; error_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); @@ -1338,11 +1338,11 @@ xfs_ioc_setxflags( error = xfs_ioctl_setattr_xflags(tp, ip, &fa); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_write; } - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_write: mnt_drop_write_file(filp); return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 38e633bad8c2..1f86033171c8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -183,7 +183,7 @@ xfs_iomap_write_direct( * Check for running out of space, note: need lock to return */ if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -213,7 +213,7 @@ xfs_iomap_write_direct( error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_unlock; @@ -236,7 +236,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } @@ -690,7 +690,7 @@ xfs_iomap_write_allocate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, nres, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -760,7 +760,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; @@ -791,7 +791,7 @@ xfs_iomap_write_allocate( trans_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -853,7 +853,7 @@ xfs_iomap_write_unwritten( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -890,7 +890,7 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -914,7 +914,7 @@ xfs_iomap_write_unwritten( error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 7f51f39f8acc..8294132e6a3c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -609,7 +609,7 @@ xfs_setattr_nonsize( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) - goto out_dqrele; + goto out_trans_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -640,7 +640,7 @@ xfs_setattr_nonsize( NULL, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ - goto out_trans_cancel; + goto out_unlock; } } @@ -699,7 +699,7 @@ xfs_setattr_nonsize( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -729,10 +729,10 @@ xfs_setattr_nonsize( return 0; -out_trans_cancel: - xfs_trans_cancel(tp, 0); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); -out_dqrele: +out_trans_cancel: + xfs_trans_cancel(tp); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; @@ -752,7 +752,6 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; - uint commit_flags = 0; bool did_zeroing = false; trace_xfs_setattr(ip); @@ -848,7 +847,11 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); + else + error = block_truncate_page(inode->i_mapping, newsize, + xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); @@ -858,7 +861,6 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -898,7 +900,7 @@ xfs_setattr_size( if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) - goto out_trans_abort; + goto out_trans_cancel; /* * Truncated "down", so we're removing references to old data @@ -925,16 +927,14 @@ xfs_setattr_size( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; -out_trans_abort: - commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, commit_flags); + xfs_trans_cancel(tp); goto out_unlock; } @@ -981,7 +981,7 @@ xfs_vn_update_time( tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1003,7 +1003,7 @@ xfs_vn_update_time( } xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -1188,22 +1188,22 @@ xfs_diflags_to_iflags( struct inode *inode, struct xfs_inode *ip) { - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + uint16_t flags = ip->i_d.di_flags; + + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | + S_NOATIME | S_DAX); + + if (flags & XFS_DIFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + if (flags & XFS_DIFLAG_APPEND) inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + if (flags & XFS_DIFLAG_SYNC) inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; + /* XXX: Also needs an on-disk per inode flag! */ + if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + inode->i_flags |= S_DAX; } /* diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 80429891dc9b..930ebd86beba 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk( } irec->ir_free |= xfs_inobt_maskn(0, idx); - *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; + *icount = irec->ir_count - irec->ir_freecount; } return 0; @@ -415,6 +415,8 @@ xfs_bulkstat( goto del_cursor; if (icount) { irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; @@ -447,13 +449,15 @@ xfs_bulkstat( * If this chunk has any allocated inodes, save it. * Also start read-ahead now for this chunk. */ - if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + if (r.ir_freecount < r.ir_count) { xfs_bulkstat_ichunk_ra(mp, agno, &r); irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; - icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + icount += r.ir_count - r.ir_freecount; } error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { @@ -469,7 +473,8 @@ xfs_bulkstat( * pending error, then we are done. */ del_cursor: - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, error ? + XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_buf_relse(agbp); if (error) break; @@ -599,8 +604,7 @@ xfs_inumbers( agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, r.ir_startino); - buffer[bufidx].xi_alloccount = - XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount; buffer[bufidx].xi_allocmask = ~r.ir_free; if (++bufidx == bcount) { long written; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7c7842c85a08..85f883dd6207 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,26 +32,12 @@ typedef unsigned int __uint32_t; typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; -typedef __uint32_t inst_t; /* an instruction */ - typedef __s64 xfs_off_t; /* <file offset> type */ typedef unsigned long long xfs_ino_t; /* <inode> type */ typedef __s64 xfs_daddr_t; /* <disk address> type */ -typedef char * xfs_caddr_t; /* <core address> type */ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; -/* __psint_t is the same size as a pointer */ -#if (BITS_PER_LONG == 32) -typedef __int32_t __psint_t; -typedef __uint32_t __psunsigned_t; -#elif (BITS_PER_LONG == 64) -typedef __int64_t __psint_t; -typedef __uint64_t __psunsigned_t; -#else -#error BITS_PER_LONG must be 32 or 64 -#endif - #include "xfs_types.h" #include "kmem.h" diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bcc7cfabb787..aaadee0969c9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -109,7 +109,7 @@ xlog_ungrant_log_space( STATIC void xlog_verify_dest_ptr( struct xlog *log, - char *ptr); + void *ptr); STATIC void xlog_verify_grant_tail( struct xlog *log); @@ -513,7 +513,7 @@ xfs_log_done( struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags) + bool regrant) { struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; @@ -526,14 +526,11 @@ xfs_log_done( (((ticket->t_flags & XLOG_TIC_INITED) == 0) && (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; - if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { - flags |= XFS_LOG_REL_PERM_RESERV; - } + regrant = false; } - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || - (flags & XFS_LOG_REL_PERM_RESERV)) { + if (!regrant) { trace_xfs_log_done_nonperm(log, ticket); /* @@ -541,7 +538,6 @@ xfs_log_done( * request has been made to release a permanent reservation. */ xlog_ungrant_log_space(log, ticket); - xfs_log_ticket_put(ticket); } else { trace_xfs_log_done_perm(log, ticket); @@ -553,6 +549,7 @@ xfs_log_done( ticket->t_flags |= XLOG_TIC_INITED; } + xfs_log_ticket_put(ticket); return lsn; } @@ -671,9 +668,9 @@ xfs_log_mount( ASSERT(0); goto out_free_log; } + xfs_crit(mp, "Log size out of supported range."); xfs_crit(mp, -"Log size out of supported range. Continuing onwards, but if log hangs are\n" -"experienced then please report this message in the bug report."); +"Continuing onwards, but if log hangs are experienced then please report this message in the bug report."); } /* @@ -703,6 +700,7 @@ xfs_log_mount( if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); + xlog_recover_cancel(mp->m_log); goto out_destroy_ail; } } @@ -743,18 +741,35 @@ out: * it. */ int -xfs_log_mount_finish(xfs_mount_t *mp) +xfs_log_mount_finish( + struct xfs_mount *mp) { int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - error = xlog_recover_finish(mp->m_log); - if (!error) - xfs_log_work_queue(mp); - } else { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; } + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + + return error; +} + +/* + * The mount has failed. Cancel the recovery if it hasn't completed and destroy + * the log. + */ +int +xfs_log_mount_cancel( + struct xfs_mount *mp) +{ + int error; + + error = xlog_recover_cancel(mp->m_log); + xfs_log_unmount(mp); return error; } @@ -1145,11 +1160,13 @@ xlog_space_left( * In this case we just want to return the size of the * log as the amount of space left. */ + xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); xfs_alert(log->l_mp, - "xlog_space_left: head behind tail\n" - " tail_cycle = %d, tail_bytes = %d\n" - " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, head_cycle, head_bytes); + " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); + xfs_alert(log->l_mp, + " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); ASSERT(0); free_bytes = log->l_logsize; } @@ -1447,7 +1464,7 @@ xlog_alloc_log( iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; #ifdef DEBUG - log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); + log->l_iclog_bak[i] = &iclog->ic_header; #endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); @@ -1602,7 +1619,7 @@ xlog_pack_data( int i, j, k; int size = iclog->ic_offset + roundoff; __be32 cycle_lsn; - xfs_caddr_t dp; + char *dp; cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); @@ -1655,8 +1672,13 @@ xlog_cksum( if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; + int xheads; + + xheads = size / XLOG_HEADER_CYCLE_SIZE; + if (size % XLOG_HEADER_CYCLE_SIZE) + xheads++; - for (i = 1; i < log->l_iclog_heads; i++) { + for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, sizeof(struct xlog_rec_ext_header)); } @@ -2031,26 +2053,24 @@ xlog_print_tic_res( "SWAPEXT" }; - xfs_warn(mp, - "xlog_write: reservation summary:\n" - " trans type = %s (%u)\n" - " unit res = %d bytes\n" - " current res = %d bytes\n" - " total reg = %u bytes (o/flow = %u bytes)\n" - " ophdrs = %u (ophdr space = %u bytes)\n" - " ophdr + reg = %u bytes\n" - " num regions = %u", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + xfs_warn(mp, "xlog_write: reservation summary:"); + xfs_warn(mp, " trans type = %s (%u)", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), - ticket->t_trans_type, - ticket->t_unit_res, - ticket->t_curr_res, - ticket->t_res_arr_sum, ticket->t_res_o_flow, - ticket->t_res_num_ophdrs, ophdr_spc, - ticket->t_res_arr_sum + - ticket->t_res_o_flow + ophdr_spc, - ticket->t_res_num); + ticket->t_trans_type); + xfs_warn(mp, " unit res = %d bytes", + ticket->t_unit_res); + xfs_warn(mp, " current res = %d bytes", + ticket->t_curr_res); + xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", + ticket->t_res_arr_sum, ticket->t_res_o_flow); + xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", + ticket->t_res_num_ophdrs, ophdr_spc); + xfs_warn(mp, " ophdr + reg = %u bytes", + ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); + xfs_warn(mp, " num regions = %u", + ticket->t_res_num); for (i = 0; i < ticket->t_res_num; i++) { uint r_type = ticket->t_res_arr[i].r_type; @@ -3664,7 +3684,7 @@ xlog_ticket_alloc( void xlog_verify_dest_ptr( struct xlog *log, - char *ptr) + void *ptr) { int i; int good_ptr = 0; @@ -3767,9 +3787,8 @@ xlog_verify_iclog( xlog_op_header_t *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; - xfs_caddr_t ptr; - xfs_caddr_t base_ptr; - __psint_t field_offset; + void *base_ptr, *ptr, *p; + ptrdiff_t field_offset; __uint8_t clientid; int len, i, j, k, op_len; int idx; @@ -3788,9 +3807,9 @@ xlog_verify_iclog( if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - ptr = (xfs_caddr_t) &iclog->ic_header; - for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; - ptr += BBSIZE) { + base_ptr = ptr = &iclog->ic_header; + p = &iclog->ic_header; + for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", __func__); @@ -3798,20 +3817,19 @@ xlog_verify_iclog( /* check fields */ len = be32_to_cpu(iclog->ic_header.h_num_logops); - ptr = iclog->ic_datap; - base_ptr = ptr; - ophead = (xlog_op_header_t *)ptr; + base_ptr = ptr = iclog->ic_datap; + ophead = ptr; xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = (xlog_op_header_t *)ptr; + ophead = ptr; /* clientid is only 1 byte */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + p = &ophead->oh_clientid; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3829,13 +3847,13 @@ xlog_verify_iclog( (unsigned long)field_offset); /* check length */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + p = &ophead->oh_len; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((__psint_t)&ophead->oh_len - - (__psint_t)iclog->ic_datap); + idx = BTOBBT((uintptr_t)&ophead->oh_len - + (uintptr_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e0deb95abd..09d91d3166cd 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LSN_CMP(x,y) _lsn_cmp(x,y) /* - * Macros, structures, prototypes for interface to the log manager. - */ - -/* - * Flags to xfs_log_done() - */ -#define XFS_LOG_REL_PERM_RESERV 0x1 - -/* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk @@ -138,7 +129,7 @@ struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags); + bool regrant); int _xfs_log_force(struct xfs_mount *mp, uint flags, int *log_forced); @@ -156,6 +147,7 @@ int xfs_log_mount(struct xfs_mount *mp, xfs_daddr_t start_block, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); +int xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); @@ -183,7 +175,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, int flags); + xfs_lsn_t *commit_lsn, bool regrant); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 45cc0ce18adf..4e7649351f5a 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -307,7 +307,13 @@ xlog_cil_insert_items( if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - list_move_tail(&lip->li_cil, &cil->xc_cil); + /* + * Only move the item if it isn't already at the tail. This is + * to prevent a transient list_empty() state when reinserting + * an item that is already the only item in the CIL. + */ + if (!list_is_last(&lip->li_cil, &cil->xc_cil)) + list_move_tail(&lip->li_cil, &cil->xc_cil); } /* account for space used by new iovec headers */ @@ -624,7 +630,7 @@ restart: spin_unlock(&cil->xc_push_lock); /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); if (commit_lsn == -1) goto out_abort; @@ -773,14 +779,10 @@ xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, - int flags) + bool regrant) { struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; - int log_flags = 0; - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; /* lock out background commit */ down_read(&cil->xc_ctx_lock); @@ -795,7 +797,7 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = tp->t_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_log_done(mp, tp->t_ticket, NULL, regrant); xfs_trans_unreserve_and_mod_sb(tp); /* @@ -809,7 +811,7 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, tp->t_commit_lsn, 0); + xfs_trans_free_items(tp, tp->t_commit_lsn, false); xlog_cil_push_background(log); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index db7cbdeb2b42..950f3f94720c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -409,7 +409,7 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG - char *l_iclog_bak[XLOG_MAX_ICLOGS]; + void *l_iclog_bak[XLOG_MAX_ICLOGS]; #endif }; @@ -426,6 +426,8 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); +extern int +xlog_recover_cancel(struct xlog *); extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4f5784f85a5b..512a0945d52a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -147,7 +147,7 @@ xlog_put_bp( * Return the address of the start of the given block number's data * in a log buffer. The buffer covers a log sector-aligned region. */ -STATIC xfs_caddr_t +STATIC char * xlog_align( struct xlog *log, xfs_daddr_t blk_no, @@ -203,7 +203,7 @@ xlog_bread( xfs_daddr_t blk_no, int nbblks, struct xfs_buf *bp, - xfs_caddr_t *offset) + char **offset) { int error; @@ -225,9 +225,9 @@ xlog_bread_offset( xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ struct xfs_buf *bp, - xfs_caddr_t offset) + char *offset) { - xfs_caddr_t orig_offset = bp->b_addr; + char *orig_offset = bp->b_addr; int orig_len = BBTOB(bp->b_length); int error, error2; @@ -396,7 +396,7 @@ xlog_find_cycle_start( xfs_daddr_t *last_blk, uint cycle) { - xfs_caddr_t offset; + char *offset; xfs_daddr_t mid_blk; xfs_daddr_t end_blk; uint mid_cycle; @@ -443,7 +443,7 @@ xlog_find_verify_cycle( uint cycle; xfs_buf_t *bp; xfs_daddr_t bufblks; - xfs_caddr_t buf = NULL; + char *buf = NULL; int error = 0; /* @@ -509,7 +509,7 @@ xlog_find_verify_log_record( { xfs_daddr_t i; xfs_buf_t *bp; - xfs_caddr_t offset = NULL; + char *offset = NULL; xlog_rec_header_t *head = NULL; int error = 0; int smallmem = 0; @@ -616,7 +616,7 @@ xlog_find_head( xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; int num_scan_bblks; uint first_half_cycle, last_half_cycle; @@ -891,7 +891,7 @@ xlog_find_tail( { xlog_rec_header_t *rhead; xlog_op_header_t *op_head; - xfs_caddr_t offset = NULL; + char *offset = NULL; xfs_buf_t *bp; int error, i, found; xfs_daddr_t umount_data_blk; @@ -1099,7 +1099,7 @@ xlog_find_zeroed( xfs_daddr_t *blk_no) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; uint first_cycle, last_cycle; xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; @@ -1199,7 +1199,7 @@ bp_err: STATIC void xlog_add_record( struct xlog *log, - xfs_caddr_t buf, + char *buf, int cycle, int block, int tail_cycle, @@ -1227,7 +1227,7 @@ xlog_write_log_records( int tail_cycle, int tail_block) { - xfs_caddr_t offset; + char *offset; xfs_buf_t *bp; int balign, ealign; int sectbb = log->l_sectBBsize; @@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer( return -EFSCORRUPTED; } - buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, - next_unlinked_offset); + buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; /* @@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer( * have to leave the inode in a consistent state for whoever * reads it next.... */ - xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_dinode_calc_crc(mp, xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); } @@ -1887,19 +1886,34 @@ xlog_recover_get_buf_lsn( uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; break; case XFS_ATTR3_RMT_MAGIC: - lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); - uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; - break; + /* + * Remote attr blocks are written synchronously, rather than + * being logged. That means they do not contain a valid LSN + * (i.e. transactionally ordered) in them, and hence any time we + * see a buffer to replay over the top of a remote attribute + * block we should simply do so. + */ + goto recover_immediately; case XFS_SB_MAGIC: + /* + * superblock uuids are magic. We may or may not have a + * sb_meta_uuid on disk, but it will be set in the in-core + * superblock. We set the uuid pointer for verification + * according to the superblock feature mask to ensure we check + * the relevant UUID in the superblock. + */ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - uuid = &((struct xfs_dsb *)blk)->sb_uuid; + if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; + else + uuid = &((struct xfs_dsb *)blk)->sb_uuid; break; default: break; } if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) goto recover_immediately; return lsn; } @@ -2503,8 +2517,8 @@ xlog_recover_inode_pass2( xfs_buf_t *bp; xfs_dinode_t *dip; int len; - xfs_caddr_t src; - xfs_caddr_t dest; + char *src; + char *dest; int error; int attr_index; uint fields; @@ -2546,7 +2560,7 @@ xlog_recover_inode_pass2( goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + dip = xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks @@ -2885,7 +2899,7 @@ xlog_recover_dquot_pass2( return error; ASSERT(bp); - ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); /* * If the dquot has an LSN in it, recover the dquot only if it's less @@ -2929,16 +2943,16 @@ xlog_recover_efi_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; - xfs_mount_t *mp = log->l_mp; - xfs_efi_log_item_t *efip; - xfs_efi_log_format_t *efi_formatp; + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; efi_formatp = item->ri_buf[0].i_addr; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); - if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), - &(efip->efi_format)))) { + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { xfs_efi_item_free(efip); return error; } @@ -2946,20 +2960,23 @@ xlog_recover_efi_pass2( spin_lock(&log->l_ailp->xa_lock); /* - * xfs_trans_ail_update() drops the AIL lock. + * The EFI has two references. One for the EFD and one for EFI to ensure + * it makes it into the AIL. Insert the EFI into the AIL directly and + * drop the EFI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. */ xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + xfs_efi_release(efip); return 0; } /* - * This routine is called when an efd format structure is found in - * a committed transaction in the log. It's purpose is to cancel - * the corresponding efi if it was still in the log. To do this - * it searches the AIL for the efi with an id equal to that in the - * efd format structure. If we find it, we remove the efi from the - * AIL and free it. + * This routine is called when an EFD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding EFI if it + * was still in the log. To do this it searches the AIL for the EFI with an id + * equal to that in the EFD format structure. If we find it we drop the EFD + * reference, which removes the EFI from the AIL and frees it. */ STATIC int xlog_recover_efd_pass2( @@ -2981,8 +2998,8 @@ xlog_recover_efd_pass2( efi_id = efd_formatp->efd_efi_id; /* - * Search for the efi with the id in the efd format structure - * in the AIL. + * Search for the EFI with the id in the EFD format structure in the + * AIL. */ spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); @@ -2991,18 +3008,18 @@ xlog_recover_efd_pass2( efip = (xfs_efi_log_item_t *)lip; if (efip->efi_format.efi_id == efi_id) { /* - * xfs_trans_ail_delete() drops the - * AIL lock. + * Drop the EFD reference to the EFI. This + * removes the EFI from the AIL and frees it. */ - xfs_trans_ail_delete(ailp, lip, - SHUTDOWN_CORRUPT_INCORE); - xfs_efi_item_free(efip); + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); spin_lock(&ailp->xa_lock); break; } } lip = xfs_trans_ail_cursor_next(ailp, &cur); } + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); @@ -3030,6 +3047,11 @@ xlog_recover_do_icreate_pass2( unsigned int count; unsigned int isize; xfs_agblock_t length; + int blks_per_cluster; + int bb_per_cluster; + int cancel_count; + int nbufs; + int i; icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; if (icl->icl_type != XFS_LI_ICREATE) { @@ -3068,32 +3090,65 @@ xlog_recover_do_icreate_pass2( return -EINVAL; } - /* existing allocation is fixed value */ - ASSERT(count == mp->m_ialloc_inos); - ASSERT(length == mp->m_ialloc_blks); - if (count != mp->m_ialloc_inos || - length != mp->m_ialloc_blks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + /* + * The inode chunk is either full or sparse and we only support + * m_ialloc_min_blks sized sparse allocations at this time. + */ + if (length != mp->m_ialloc_blks && + length != mp->m_ialloc_min_blks) { + xfs_warn(log->l_mp, + "%s: unsupported chunk length", __FUNCTION__); return -EINVAL; } + /* verify inode count is consistent with extent length */ + if ((count >> mp->m_sb.sb_inopblog) != length) { + xfs_warn(log->l_mp, + "%s: inconsistent inode count and chunk length", + __FUNCTION__); + return -EINVAL; + } + + /* + * The icreate transaction can cover multiple cluster buffers and these + * buffers could have been freed and reused. Check the individual + * buffers for cancellation so we don't overwrite anything written after + * a cancellation. + */ + blks_per_cluster = xfs_icluster_size_fsb(mp); + bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster); + nbufs = length / blks_per_cluster; + for (i = 0, cancel_count = 0; i < nbufs; i++) { + xfs_daddr_t daddr; + + daddr = XFS_AGB_TO_DADDR(mp, agno, + agbno + i * blks_per_cluster); + if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) + cancel_count++; + } + /* - * Inode buffers can be freed. Do not replay the inode initialisation as - * we could be overwriting something written after this inode buffer was - * cancelled. + * We currently only use icreate for a single allocation at a time. This + * means we should expect either all or none of the buffers to be + * cancelled. Be conservative and skip replay if at least one buffer is + * cancelled, but warn the user that something is awry if the buffers + * are not consistent. * - * XXX: we need to iterate all buffers and only init those that are not - * cancelled. I think that a more fine grained factoring of - * xfs_ialloc_inode_init may be appropriate here to enable this to be - * done easily. + * XXX: This must be refined to only skip cancelled clusters once we use + * icreate for multiple chunk allocations. */ - if (xlog_check_buffer_cancelled(log, - XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + ASSERT(!cancel_count || cancel_count == nbufs); + if (cancel_count) { + if (cancel_count != nbufs) + xfs_warn(mp, + "WARNING: partial inode chunk cancellation, skipped icreate."); + trace_xfs_log_recover_icreate_cancel(log, icl); return 0; + } - xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, - be32_to_cpu(icl->icl_gen)); - return 0; + trace_xfs_log_recover_icreate_recover(log, icl); + return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, + length, be32_to_cpu(icl->icl_gen)); } STATIC void @@ -3364,21 +3419,31 @@ STATIC int xlog_recover_add_to_cont_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xlog_recover_item_t *item; - xfs_caddr_t ptr, old_ptr; + char *ptr, *old_ptr; int old_len; + /* + * If the transaction is empty, the header was split across this and the + * previous record. Copy the rest of the header. + */ if (list_empty(&trans->r_itemq)) { - /* finish copying rest of trans header */ + ASSERT(len < sizeof(struct xfs_trans_header)); + if (len > sizeof(struct xfs_trans_header)) { + xfs_warn(log->l_mp, "%s: bad header length", __func__); + return -EIO; + } + xlog_recover_add_item(&trans->r_itemq); - ptr = (xfs_caddr_t) &trans->r_theader + - sizeof(xfs_trans_header_t) - len; + ptr = (char *)&trans->r_theader + + sizeof(struct xfs_trans_header) - len; memcpy(ptr, dp, len); return 0; } + /* take the tail entry */ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); @@ -3410,12 +3475,12 @@ STATIC int xlog_recover_add_to_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xfs_inode_log_format_t *in_f; /* any will do */ xlog_recover_item_t *item; - xfs_caddr_t ptr; + char *ptr; if (!len) return 0; @@ -3427,7 +3492,19 @@ xlog_recover_add_to_trans( ASSERT(0); return -EIO; } - if (len == sizeof(xfs_trans_header_t)) + + if (len > sizeof(struct xfs_trans_header)) { + xfs_warn(log->l_mp, "%s: bad header length", __func__); + ASSERT(0); + return -EIO; + } + + /* + * The transaction header can be arbitrarily split across op + * records. If we don't have the whole thing here, copy what we + * do have and handle the rest in the next record. + */ + if (len == sizeof(struct xfs_trans_header)) xlog_recover_add_item(&trans->r_itemq); memcpy(&trans->r_theader, dp, len); return 0; @@ -3504,7 +3581,7 @@ STATIC int xlog_recovery_process_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, unsigned int len, unsigned int flags, int pass) @@ -3611,8 +3688,8 @@ xlog_recover_process_ophdr( struct hlist_head rhash[], struct xlog_rec_header *rhead, struct xlog_op_header *ohead, - xfs_caddr_t dp, - xfs_caddr_t end, + char *dp, + char *end, int pass) { struct xlog_recover *trans; @@ -3661,11 +3738,11 @@ xlog_recover_process_data( struct xlog *log, struct hlist_head rhash[], struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, int pass) { struct xlog_op_header *ohead; - xfs_caddr_t end; + char *end; int num_logops; int error; @@ -3730,7 +3807,7 @@ xlog_recover_process_efi( * free the memory associated with it. */ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - xfs_efi_release(efip, efip->efi_format.efi_nextents); + xfs_efi_release(efip); return -EIO; } } @@ -3743,19 +3820,19 @@ xlog_recover_process_efi( for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); - error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + error = xfs_trans_free_extent(tp, efdp, extp->ext_start, + extp->ext_len); if (error) goto abort_error; - xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, - extp->ext_len); + } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; abort_error: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -3779,10 +3856,10 @@ abort_error: */ STATIC int xlog_recover_process_efis( - struct xlog *log) + struct xlog *log) { - xfs_log_item_t *lip; - xfs_efi_log_item_t *efip; + struct xfs_log_item *lip; + struct xfs_efi_log_item *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; @@ -3806,7 +3883,7 @@ xlog_recover_process_efis( /* * Skip EFIs that we've already processed. */ - efip = (xfs_efi_log_item_t *)lip; + efip = container_of(lip, struct xfs_efi_log_item, efi_item); if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; @@ -3826,6 +3903,50 @@ out: } /* + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending EFIs so they don't pin the AIL. + */ +STATIC int +xlog_recover_cancel_efis( + struct xlog *log) +{ + struct xfs_log_item *lip; + struct xfs_efi_log_item *efip; + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; + + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + /* + * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. + */ + if (lip->li_type != XFS_LI_EFI) { +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif + break; + } + + efip = container_of(lip, struct xfs_efi_log_item, efi_item); + + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); + spin_lock(&ailp->xa_lock); + + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + return error; +} + +/* * This routine performs a transaction to null out a bad inode pointer * in an agi unlinked inode hash bucket. */ @@ -3857,13 +3978,13 @@ xlog_recover_clear_agi_bucket( xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out_error; return; out_abort: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); return; @@ -4010,7 +4131,7 @@ xlog_recover_process_iunlinks( STATIC int xlog_unpack_data_crc( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { __le32 crc; @@ -4040,7 +4161,7 @@ xlog_unpack_data_crc( STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { int i, j, k; @@ -4122,7 +4243,7 @@ xlog_do_recovery_pass( { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; - xfs_caddr_t offset; + char *offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; @@ -4518,11 +4639,13 @@ xlog_recover( xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(log->l_mp, -"Superblock has unknown incompatible log features (0x%x) enabled.\n" -"The log can not be fully and/or safely recovered by this kernel.\n" -"Please recover the log on a kernel that supports the unknown features.", +"Superblock has unknown incompatible log features (0x%x) enabled.", (log->l_mp->m_sb.sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + xfs_warn(log->l_mp, +"The log can not be fully and/or safely recovered by this kernel."); + xfs_warn(log->l_mp, +"Please recover the log on a kernel that supports the unknown features."); return -EINVAL; } @@ -4598,6 +4721,17 @@ xlog_recover_finish( return 0; } +int +xlog_recover_cancel( + struct xlog *log) +{ + int error = 0; + + if (log->l_flags & XLOG_RECOVERY_NEEDED) + error = xlog_recover_cancel_efis(log); + + return error; +} #if defined(DEBUG) /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6f23fbdfb365..bf92e0c037c7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -615,14 +615,14 @@ xfs_default_resblks(xfs_mount_t *mp) */ int xfs_mountfs( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); - xfs_inode_t *rip; - __uint64_t resblks; - uint quotamount = 0; - uint quotaflags = 0; - int error = 0; + struct xfs_sb *sbp = &(mp->m_sb); + struct xfs_inode *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; xfs_sb_mount_common(mp, sbp); @@ -725,6 +725,22 @@ xfs_mountfs( } /* + * If enabled, sparse inode chunk alignment is expected to match the + * cluster size. Full inode chunk alignment must match the chunk size, + * but that is checked on sb read verification... + */ + if (xfs_sb_version_hassparseinodes(&mp->m_sb) && + mp->m_sb.sb_spino_align != + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) { + xfs_warn(mp, + "Sparse inode block alignment (%u) must match cluster size (%llu).", + mp->m_sb.sb_spino_align, + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)); + error = -EINVAL; + goto out_remove_uuid; + } + + /* * Set inode alignment fields */ xfs_set_inoalignment(mp); @@ -783,7 +799,9 @@ xfs_mountfs( } /* - * log's mount-time initialization. Perform 1st part recovery if needed + * Log's mount-time initialization. The first part of recovery can place + * some items on the AIL, to be handled when recovery is finished or + * cancelled. */ error = xfs_log_mount(mp, mp->m_logdev_targp, XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), @@ -894,9 +912,9 @@ xfs_mountfs( } /* - * Finish recovering the file system. This part needed to be - * delayed until after the root and real-time bitmap inodes - * were consistently read in. + * Finish recovering the file system. This part needed to be delayed + * until after the root and real-time bitmap inodes were consistently + * read in. */ error = xfs_log_mount_finish(mp); if (error) { @@ -939,8 +957,10 @@ xfs_mountfs( xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: - xfs_log_unmount(mp); + xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_wait_buftarg(mp->m_logdev_targp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8c995a2ccb6f..7999e91cd49a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -101,6 +101,8 @@ typedef struct xfs_mount { __uint64_t m_flags; /* global mount flags */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ + int m_ialloc_min_blks;/* min blocks in sparse inode + * allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ struct xfs_trans_resv m_resv; /* precomputed res values */ @@ -179,6 +181,8 @@ typedef struct xfs_mount { allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ +#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ + /* * Default minimum read and write sizes. diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 981a657eca39..ab4a6066f7ca 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -306,7 +306,7 @@ xfs_fs_commit_blocks( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_iolock; } @@ -321,7 +321,7 @@ xfs_fs_commit_blocks( } xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_iolock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5538468c7f63..eac9549efd52 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -756,7 +756,7 @@ xfs_qm_qino_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, XFS_QM_QINOCREATE_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -764,8 +764,7 @@ xfs_qm_qino_alloc( error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } } @@ -796,7 +795,7 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9a25c9275fb3..3640c6e896af 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile( tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile( error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } ASSERT(ip->i_d.di_nextents == 0); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); @@ -437,7 +436,7 @@ xfs_qm_scall_setqlim( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_rele; } @@ -548,7 +547,7 @@ xfs_qm_scall_setqlim( dqp->dq_flags |= XFS_DQ_DIRTY; xfs_trans_log_dquot(tp, dqp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_rele: xfs_qm_dqrele(dqp); @@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - return error; + return xfs_trans_commit(tp); } @@ -605,7 +603,7 @@ xfs_qm_log_quotaoff( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } @@ -624,7 +622,7 @@ xfs_qm_log_quotaoff( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 5376dd406ba2..ce6506adab7b 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -55,7 +55,6 @@ struct xfs_trans; typedef struct xfs_dqtrx { struct xfs_dquot *qt_dquot; /* the dquot this refers to */ ulong qt_blk_res; /* blks reserved on a dquot */ - ulong qt_blk_res_used; /* blks used from the reservation */ ulong qt_ino_res; /* inode reserved on a dquot */ ulong qt_ino_res_used; /* inodes used from the reservation */ long qt_bcount_delta; /* dquot blk count changes */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f2079b6911cc..ab1bac6a3a1c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -757,32 +757,30 @@ xfs_rtallocate_extent_size( /* * Allocate space to the bitmap or summary file, and zero it, for growfs. */ -STATIC int /* error */ +STATIC int xfs_growfs_rt_alloc( - xfs_mount_t *mp, /* file system mount point */ - xfs_extlen_t oblocks, /* old count of blocks */ - xfs_extlen_t nblocks, /* new count of blocks */ - xfs_inode_t *ip) /* inode (bitmap/summary) */ + struct xfs_mount *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + struct xfs_inode *ip) /* inode (bitmap/summary) */ { - xfs_fileoff_t bno; /* block number in file */ - xfs_buf_t *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ - xfs_daddr_t d; /* disk block address */ - int error; /* error return value */ - xfs_fsblock_t firstblock; /* first block allocated in xaction */ - xfs_bmap_free_t flist; /* list of freed blocks */ - xfs_fsblock_t fsbno; /* filesystem block for bno */ - xfs_bmbt_irec_t map; /* block map output */ - int nmap; /* number of block maps */ - int resblks; /* space reservation */ + xfs_fileoff_t bno; /* block number in file */ + struct xfs_buf *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock;/* first block allocated in xaction */ + struct xfs_bmap_free flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + struct xfs_bmbt_irec map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + struct xfs_trans *tp; /* * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - int cancelflags = 0; - xfs_trans_t *tp; - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* @@ -791,8 +789,7 @@ xfs_growfs_rt_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, resblks, 0); if (error) - goto error_cancel; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; + goto out_trans_cancel; /* * Lock the inode. */ @@ -804,28 +801,26 @@ xfs_growfs_rt_alloc( * Allocate blocks to the bitmap file. */ nmap = 1; - cancelflags |= XFS_TRANS_ABORT; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, &firstblock, resblks, &map, &nmap, &flist); if (!error && nmap < 1) error = -ENOSPC; if (error) - goto error_cancel; + goto out_bmap_cancel; /* * Free any blocks freed up in the transaction, then commit. */ error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + goto out_bmap_cancel; + error = xfs_trans_commit(tp); if (error) - goto error; + return error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. */ - cancelflags = 0; for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { @@ -836,7 +831,7 @@ xfs_growfs_rt_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, 0, 0); if (error) - goto error_cancel; + goto out_trans_cancel; /* * Lock the bitmap inode. */ @@ -850,27 +845,29 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0); if (bp == NULL) { error = -EIO; -error_cancel: - xfs_trans_cancel(tp, cancelflags); - goto error; + goto out_trans_cancel; } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* * Commit the transaction. */ - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) - goto error; + return error; } /* * Go on to the next extent, if any. */ oblocks = map.br_startoff + map.br_blockcount; } + return 0; -error: +out_bmap_cancel: + xfs_bmap_cancel(&flist); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } @@ -973,7 +970,6 @@ xfs_growfs_rt( bmbno < nrbmblocks; bmbno++) { xfs_trans_t *tp; - int cancelflags = 0; *nmp = *mp; nsbp = &nmp->m_sb; @@ -1015,7 +1011,6 @@ xfs_growfs_rt( mp->m_rbmip->i_d.di_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - cancelflags |= XFS_TRANS_ABORT; /* * Get the summary inode into the transaction. */ @@ -1062,7 +1057,7 @@ xfs_growfs_rt( nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); if (error) { error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); break; } /* @@ -1076,7 +1071,7 @@ error_cancel: mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) break; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 858e1e62bbaa..904f637cfa5f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ +#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ + /* * Table driven mount option parser. * @@ -259,16 +261,8 @@ xfs_parseargs( mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); if (!mp->m_rtname) return -ENOMEM; - } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { - if (!value || !*value) { - xfs_warn(mp, "%s option requires an argument", - this_char); - return -EINVAL; - } - if (kstrtoint(value, 10, &iosize)) - return -EINVAL; - iosizelog = ffs(iosize) - 1; - } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) || + !strcmp(this_char, MNTOPT_BIOSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); @@ -363,6 +357,10 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; +#ifdef CONFIG_FS_DAX + } else if (!strcmp(this_char, MNTOPT_DAX)) { + mp->m_flags |= XFS_MOUNT_DAX; +#endif } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -452,8 +450,8 @@ done: } struct proc_xfs_info { - int flag; - char *str; + uint64_t flag; + char *str; }; STATIC int @@ -474,6 +472,7 @@ xfs_showargs( { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, + { XFS_MOUNT_DAX, "," MNTOPT_DAX }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -504,9 +503,9 @@ xfs_showargs( seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); if (mp->m_logname) - seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); if (mp->m_rtname) - seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); if (mp->m_dalign > 0) seq_printf(m, "," MNTOPT_SUNIT "=%d", @@ -1507,6 +1506,24 @@ xfs_fs_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= MS_I_VERSION; + if (mp->m_flags & XFS_MOUNT_DAX) { + xfs_warn(mp, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + if (sb->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "Filesystem block size invalid for DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + xfs_alert(mp, + "Block device does not support DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } + } + + if (xfs_sb_version_hassparseinodes(&mp->m_sb)) + xfs_alert(mp, + "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 3df411eadb86..996481eeb491 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -104,7 +104,7 @@ xfs_readlink_bmap( cur_chunk += sizeof(struct xfs_dsymlink_hdr); } - memcpy(link + offset, bp->b_addr, byte_cnt); + memcpy(link + offset, cur_chunk, byte_cnt); pathlen -= byte_cnt; offset += byte_cnt; @@ -178,7 +178,6 @@ xfs_symlink( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; @@ -224,7 +223,6 @@ xfs_symlink( return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. @@ -239,12 +237,11 @@ xfs_symlink( resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | + XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* @@ -292,7 +289,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); unlock_dp_on_error = false; /* @@ -394,7 +391,7 @@ xfs_symlink( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -407,9 +404,8 @@ xfs_symlink( out_bmap_cancel: xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -426,7 +422,7 @@ out_release_inode: xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return error; } @@ -464,7 +460,7 @@ xfs_inactive_symlink_rmt( tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -506,7 +502,7 @@ xfs_inactive_symlink_rmt( /* * Unmap the dead block(s) to the free_list. */ - error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, &first_block, &free_list, &done); if (error) goto error_bmap_cancel; @@ -533,7 +529,7 @@ xfs_inactive_symlink_rmt( /* * Commit the transaction containing extent freeing and EFDs. */ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); goto error_unlock; @@ -552,7 +548,7 @@ xfs_inactive_symlink_rmt( error_bmap_cancel: xfs_bmap_cancel(&free_list); error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 615781bf4ee5..5ed36b1e04c1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); +DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DECLARE_EVENT_CLASS(xfs_iref_class, @@ -738,6 +739,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size, __entry->blocks, __entry->shift, __entry->writeio_blocks) ) +TRACE_EVENT(xfs_irec_merge_pre, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask), + TP_ARGS(mp, agno, agino, holemask, nagino, nholemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + __field(xfs_agino_t, nagino) + __field(uint16_t, nholemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + __entry->nagino = nagino; + __entry->nholemask = holemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->agino, __entry->holemask, __entry->nagino, + __entry->nholemask) +) + +TRACE_EVENT(xfs_irec_merge_post, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask), + TP_ARGS(mp, agno, agino, holemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->agno, __entry->agino, + __entry->holemask) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ @@ -2042,6 +2090,40 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class, + TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, count) + __field(unsigned int, isize) + __field(xfs_agblock_t, length) + __field(unsigned int, gen) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->agno = be32_to_cpu(in_f->icl_ag); + __entry->agbno = be32_to_cpu(in_f->icl_agbno); + __entry->count = be32_to_cpu(in_f->icl_count); + __entry->isize = be32_to_cpu(in_f->icl_isize); + __entry->length = be32_to_cpu(in_f->icl_length); + __entry->gen = be32_to_cpu(in_f->icl_gen); + ), + TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u " + "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agbno, __entry->count, __entry->isize, + __entry->length, __entry->gen) +) +#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel); +DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover); + DECLARE_EVENT_CLASS(xfs_discard_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 220ef2c906b2..a0ab1dae9c31 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -113,7 +113,7 @@ xfs_trans_free( * blocks. Locks and log items, however, are no inherited. They must * be added to the new transaction explicitly. */ -xfs_trans_t * +STATIC xfs_trans_t * xfs_trans_dup( xfs_trans_t *tp) { @@ -251,14 +251,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - int log_flags; - - if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -744,7 +737,7 @@ void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags) + bool abort) { struct xfs_log_item_desc *lidp, *next; @@ -755,7 +748,7 @@ xfs_trans_free_items( if (commit_lsn != NULLCOMMITLSN) lip->li_ops->iop_committing(lip, commit_lsn); - if (flags & XFS_TRANS_ABORT) + if (abort) lip->li_flags |= XFS_LI_ABORTED; lip->li_ops->iop_unlock(lip); @@ -892,27 +885,17 @@ xfs_trans_committed_bulk( * have already been unlocked as if the commit had succeeded. * Do not reference the transaction structure after this call. */ -int -xfs_trans_commit( +static int +__xfs_trans_commit( struct xfs_trans *tp, - uint flags) + bool regrant) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; int error = 0; - int log_flags = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; /* - * Determine whether this commit is releasing a permanent - * log reservation or not. - */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } - - /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the * transaction and free the transaction structure. @@ -936,7 +919,7 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xfs_log_commit_cil(mp, tp, &commit_lsn, flags); + xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); @@ -964,18 +947,25 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); if (commit_lsn == -1 && !error) error = -EIO; } current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); XFS_STATS_INC(xs_trans_empty); return error; } +int +xfs_trans_commit( + struct xfs_trans *tp) +{ + return __xfs_trans_commit(tp, false); +} + /* * Unlock all of the transaction's items and free the transaction. * The transaction must not have modified any of its items, because @@ -986,29 +976,22 @@ out_unreserve: */ void xfs_trans_cancel( - xfs_trans_t *tp, - int flags) + struct xfs_trans *tp) { - int log_flags; - xfs_mount_t *mp = tp->t_mountp; + struct xfs_mount *mp = tp->t_mountp; + bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); /* - * See if the caller is being too lazy to figure out if - * the transaction really needs an abort. - */ - if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) - flags &= ~XFS_TRANS_ABORT; - /* * See if the caller is relying on us to shut down the * filesystem. This happens in paths where we detect * corruption and decide to give up. */ - if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + if (dirty && !XFS_FORCED_SHUTDOWN(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { struct xfs_log_item_desc *lidp; list_for_each_entry(lidp, &tp->t_items, lid_trans) @@ -1018,34 +1001,28 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); - } + if (tp->t_ticket) + xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); } /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when - * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * committed with xfs_trans_commit(), but we still want as soon * as possible to let chunks of it go to the log. So we commit the * chunk we've been working on and get a new transaction to continue. */ int -xfs_trans_roll( +__xfs_trans_roll( struct xfs_trans **tpp, - struct xfs_inode *dp) + struct xfs_inode *dp, + int *committed) { struct xfs_trans *trans; struct xfs_trans_res tres; @@ -1055,7 +1032,8 @@ xfs_trans_roll( * Ensure that the inode is always logged. */ trans = *tpp; - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + if (dp) + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); /* * Copy the critical parameters from one trans to the next. @@ -1071,20 +1049,14 @@ xfs_trans_roll( * is in progress. The caller takes the responsibility to cancel * the duplicate transaction that gets returned. */ - error = xfs_trans_commit(trans, 0); + error = __xfs_trans_commit(trans, true); if (error) return error; + *committed = 1; trans = *tpp; /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(trans->t_ticket); - - - /* * Reserve space in the log for th next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log @@ -1100,6 +1072,16 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp, 0); + if (dp) + xfs_trans_ijoin(trans, dp, 0); return 0; } + +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) +{ + int committed = 0; + return __xfs_trans_roll(tpp, dp, &committed); +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b5bc1ab3c4da..4643070d7cae 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -133,8 +133,6 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces that are * actually macros. */ -#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) -#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) @@ -153,7 +151,6 @@ typedef struct xfs_trans { */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -216,7 +213,6 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); -void xfs_efi_release(struct xfs_efi_log_item *, uint); void xfs_trans_log_efi_extent(xfs_trans_t *, struct xfs_efi_log_item *, xfs_fsblock_t, @@ -224,13 +220,13 @@ void xfs_trans_log_efi_extent(xfs_trans_t *, struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, struct xfs_efi_log_item *, uint); -void xfs_trans_log_efd_extent(xfs_trans_t *, - struct xfs_efd_log_item *, - xfs_fsblock_t, - xfs_extlen_t); -int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_free_extent(struct xfs_trans *, + struct xfs_efd_log_item *, xfs_fsblock_t, + xfs_extlen_t); +int xfs_trans_commit(struct xfs_trans *); +int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); -void xfs_trans_cancel(xfs_trans_t *, int); +void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 573aefb5a573..1098cf490189 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next( { struct xfs_log_item *lip = cur->item; - if ((__psint_t)lip & 1) + if ((uintptr_t)lip & 1) lip = xfs_ail_min(ailp); if (lip) cur->item = xfs_ail_next(ailp, lip); @@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear( list_for_each_entry(cur, &ailp->xa_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) - ((__psint_t)cur->item | 1); + ((uintptr_t)cur->item | 1); } } @@ -287,7 +287,7 @@ xfs_ail_splice( * find the place in the AIL where the items belong. */ lip = cur ? cur->item : NULL; - if (!lip || (__psint_t) lip & 1) + if (!lip || (uintptr_t)lip & 1) lip = __xfs_trans_ail_cursor_last(ailp, lsn); /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 76a16df55ef7..ce78534a047e 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo( xfs_trans_t *ntp) { xfs_dqtrx_t *oq, *nq; - int i,j; + int i, j; xfs_dqtrx_t *oqa, *nqa; + ulong blk_res_used; if (!otp->t_dqinfo) return; @@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo( * Because the quota blk reservation is carried forward, * it is also necessary to carry forward the DQ_DIRTY flag. */ - if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + if (otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { oqa = otp->t_dqinfo->dqs[j]; nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + blk_res_used = 0; + if (oqa[i].qt_dquot == NULL) break; oq = &oqa[i]; nq = &nqa[i]; + if (oq->qt_blk_res && oq->qt_bcount_delta > 0) + blk_res_used = oq->qt_bcount_delta; + nq->qt_dquot = oq->qt_dquot; nq->qt_bcount_delta = nq->qt_icount_delta = 0; nq->qt_rtbcount_delta = 0; @@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo( /* * Transfer whatever is left of the reservations. */ - nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; - oq->qt_blk_res = oq->qt_blk_res_used; + nq->qt_blk_res = oq->qt_blk_res - blk_res_used; + oq->qt_blk_res = blk_res_used; nq->qt_rtblk_res = oq->qt_rtblk_res - oq->qt_rtblk_res_used; @@ -239,10 +245,6 @@ xfs_trans_mod_dquot( * disk blocks used. */ case XFS_TRANS_DQ_BCOUNT: - if (qtrx->qt_blk_res && delta > 0) { - qtrx->qt_blk_res_used += (ulong)delta; - ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); - } qtrx->qt_bcount_delta += delta; break; @@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas( * reservation that a transaction structure knows of. */ if (qtrx->qt_blk_res != 0) { - if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { - if (qtrx->qt_blk_res > - qtrx->qt_blk_res_used) + ulong blk_res_used = 0; + + if (qtrx->qt_bcount_delta > 0) + blk_res_used = qtrx->qt_bcount_delta; + + if (qtrx->qt_blk_res != blk_res_used) { + if (qtrx->qt_blk_res > blk_res_used) dqp->q_res_bcount -= (xfs_qcnt_t) (qtrx->qt_blk_res - - qtrx->qt_blk_res_used); + blk_res_used); else dqp->q_res_bcount -= (xfs_qcnt_t) - (qtrx->qt_blk_res_used - + (blk_res_used - qtrx->qt_blk_res); } } else { diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 284397dd7990..a96ae540eb62 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -25,6 +25,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" +#include "xfs_alloc.h" /* * This routine is called to allocate an "extent free intention" @@ -108,19 +109,30 @@ xfs_trans_get_efd(xfs_trans_t *tp, } /* - * This routine is called to indicate that the described - * extent is to be logged as having been freed. It should - * be called once for each extent freed. + * Free an extent and log it to the EFD. Note that the transaction is marked + * dirty regardless of whether the extent free succeeds or fails to support the + * EFI/EFD lifecycle rules. */ -void -xfs_trans_log_efd_extent(xfs_trans_t *tp, - xfs_efd_log_item_t *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len) +int +xfs_trans_free_extent( + struct xfs_trans *tp, + struct xfs_efd_log_item *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) { uint next_extent; - xfs_extent_t *extp; + struct xfs_extent *extp; + int error; + error = xfs_free_extent(tp, start_block, ext_len); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) shuts down the filesystem + */ tp->t_flags |= XFS_TRANS_DIRTY; efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; @@ -130,4 +142,6 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp, extp->ext_start = start_block; extp->ext_len = ext_len; efdp->efd_next_extent++; + + return error; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd1281862ad7..49931b72da8a 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags); + bool abort); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, @@ -119,6 +119,21 @@ xfs_trans_ail_delete( xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); } +static inline void +xfs_trans_ail_remove( + struct xfs_log_item *lip, + int shutdown_type) +{ + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock */ + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip, shutdown_type); + else + spin_unlock(&ailp->xa_lock); +} + void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); void xfs_ail_push_all_sync(struct xfs_ail *); |